In [20]:
#Import Packages
import pandas as pd
import numpy as np
import rpy2.robjects as robj
import rpy2.robjects.pandas2ri # for dataframe conversion
from rpy2.robjects.packages import importr
import matplotlib.pyplot as plt

In [10]:
# Import Relevant CSV Files
cases = pd.read_csv('data/Originals/reported_indigenous_confirmed_cases.csv')
m_deaths = pd.read_csv('data/Originals/reported_deaths.csv')


In [11]:
cases.columns

Index(['GHO (CODE)', 'GHO (DISPLAY)', 'GHO (URL)', 'PUBLISHSTATE (CODE)',
       'PUBLISHSTATE (DISPLAY)', 'PUBLISHSTATE (URL)', 'YEAR (CODE)',
       'YEAR (DISPLAY)', 'YEAR (URL)', 'REGION (CODE)', 'REGION (DISPLAY)',
       'REGION (URL)', 'COUNTRY (CODE)', 'COUNTRY (DISPLAY)', 'COUNTRY (URL)',
       'Display Value', 'Numeric', 'Low', 'High', 'Comments'],
      dtype='object')

In [12]:
cases = cases.rename(columns={'YEAR (CODE)':'year',
                    'REGION (CODE)':'region',
                    'COUNTRY (CODE)':'country_code',
                    'REGION (CODE)' : 'region',
                     'COUNTRY (DISPLAY)' : 'country_name',         
                    'Numeric':'reported_cases'})

cases = cases[['year', 'country_code','reported_cases']]

print('Years from:', '', cases['year'].min(), '-', cases['year'].max())
print(cases.isnull().any())
print(cases.head())
print(cases.info())

Years from:  2000 - 2014
year              False
country_code      False
reported_cases    False
dtype: bool
   year country_code  reported_cases
0  2001          EGY             0.0
1  2003          EGY             0.0
2  2013          IRQ             0.0
3  2004          OMN             0.0
4  2012          SYR             0.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1392 entries, 0 to 1391
Data columns (total 3 columns):
year              1392 non-null int64
country_code      1392 non-null object
reported_cases    1392 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 32.7+ KB
None


In [13]:
m_deaths.columns

Index(['GHO (CODE)', 'GHO (DISPLAY)', 'GHO (URL)', 'PUBLISHSTATE (CODE)',
       'PUBLISHSTATE (DISPLAY)', 'PUBLISHSTATE (URL)', 'YEAR (CODE)',
       'YEAR (DISPLAY)', 'YEAR (URL)', 'REGION (CODE)', 'REGION (DISPLAY)',
       'REGION (URL)', 'COUNTRY (CODE)', 'COUNTRY (DISPLAY)', 'COUNTRY (URL)',
       'Display Value', 'Numeric', 'Low', 'High', 'Comments'],
      dtype='object')

In [14]:
m_deaths = m_deaths.rename(columns={'YEAR (CODE)':'year',
                    'REGION (CODE)':'region',
                    'COUNTRY (CODE)':'country_code',
                    'Numeric':'reported_deaths'})
m_deaths = m_deaths[['year', 'country_code','reported_deaths']]

print('Years from:', '', m_deaths['year'].min(), '-', m_deaths['year'].max())
print(m_deaths.isnull().any())
print(m_deaths.head())
print(m_deaths.info())

Years from:  2000 - 2014
year               False
country_code       False
reported_deaths    False
dtype: bool
   year country_code  reported_deaths
0  2008          DZA              0.0
1  2012          DZA              0.0
2  2014          DZA              0.0
3  2001          CPV              0.0
4  2013          CPV              0.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408 entries, 0 to 1407
Data columns (total 3 columns):
year               1408 non-null int64
country_code       1408 non-null object
reported_deaths    1408 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 33.1+ KB
None


In [25]:
# First, make some random data
x = np.random.normal(loc = 5, scale = 2, size = 10)
y = x + np.random.normal(loc = 0, scale = 2, size = 10)
 
# Make these into a pandas dataframe. I do this because
# more often than not, I read in a pandas dataframe, so this
# shows how to use a pandas dataframe to plot in ggplot
testData = pd.DataFrame( {'x':x, 'y':y} )
# it looks just like a dataframe from R
print(testData)
 
# Next, you make an robject containing function that makes the plot.
# the language in the function is pure R, so it can be anything
# note that the R environment is blank to start, so ggplot2 has to be
# loaded
plotFunc = robj.r("""
 library(ggplot2)
 
function(df){
 p <- ggplot(df, aes(x, y)) +
 geom_point( )
 
print(p)
 }
""")
 
# import graphics devices. This is necessary to shut the graph off
# otherwise it just hangs and freezes python
gr = importr('grDevices')
 
# convert the testData to an R dataframe
robj.pandas2ri.activate()
testData_R = robj.conversion.py2ri(testData)
 
# run the plot function on the dataframe
plotFunc(testData_R)
 
# ask for input. This requires you to press enter, otherwise the plot
# window closes immediately
raw_input()
 
# shut down the window using dev_off()
gr.dev_off()
 
# you can even save the output once you like it
plotFunc_2 = robj.r("""
 library(ggplot2)
 
function(df){
 p <- ggplot(df, aes(x, y)) +
 geom_point( ) +
 theme(
 panel.background = element_rect(fill = NA, color = 'black')
 )
 
ggsave('rpy2_magic.pdf', plot = p, width = 6.5, height = 5.5)
 }
""")
 
plotFunc_2(testData_R)


          x         y
0  5.671309  3.618337
1  6.896558  4.650572
2  7.596034  6.297825
3  5.057157  5.681918
4  6.544976  3.796347
5  3.116228  5.290630
6  6.533765  8.386398
7  7.147304  7.706238
8  3.490244  1.785937
9  1.443603  1.931084


NameError: name 'raw_input' is not defined

In [None]:
library(ggplot2)
library(ggmap)
library(maps)
library(data.table)

GlobalLandTemperaturesByCity <- fread("../input/GlobalLandTemperaturesByCity.csv")

#Create some useful data points
GlobalLandTemperaturesByCity$dt<-as.Date(GlobalLandTemperaturesByCity$dt,"%Y-%m-%d")
GlobalLandTemperaturesByCity$lat<-as.numeric(gsub("N|E|S|W", "",GlobalLandTemperaturesByCity$Latitude))*ifelse(grepl("S",GlobalLandTemperaturesByCity$Latitude),-1,1)
GlobalLandTemperaturesByCity$long<-as.numeric(gsub("N|E|S|W", "", GlobalLandTemperaturesByCity$Longitude))*ifelse(grepl("W",GlobalLandTemperaturesByCity$Longitude),-1,1)
GlobalLandTemperaturesByCity$Month<-as.numeric(format(GlobalLandTemperaturesByCity$dt,"%m"))
GlobalLandTemperaturesByCity$Year<-as.numeric(format(GlobalLandTemperaturesByCity$dt,"%Y"))

setkey(GlobalLandTemperaturesByCity,long,lat,Month,Year)

#Create a meta dataset of each city to add new features...

meta.city<-unique(GlobalLandTemperaturesByCity[,c(4,8:10),with=FALSE],by=c("Month","long","lat","City"))
setkey(meta.city,long,lat,Month)

meta.city.length<-length(meta.city$City)
meta.city$intercept.coef<-numeric(meta.city.length)
meta.city$year.coef<-numeric(meta.city.length)

#Create a data table for faster subsetting. Data before 1880 is rejected (uncertainty is too high)
dt <- as.data.table(na.omit(subset(GlobalLandTemperaturesByCity,Year>1880)))

#This loop will fill in the columns of the meta.city table.
for(i in 1:meta.city.length){
  dt.subset<-dt[list(meta.city$long[i],meta.city$lat[i],meta.city$Month[i]),]
  lmfit<-with(dt.subset,lm.fit(x=cbind(1,Year),y=AverageTemperature))
  meta.city$intercept.coef[i]<-lmfit$coefficients[1]
  meta.city$year.coef[i]<-lmfit$coefficients[2]
}

ggplot()+borders("world",colour="grey75",fill="black")+
  theme(panel.background=element_rect(fill = "gray93"))+
  geom_point(data=subset(meta.city, Month==1),aes(x=long,y=lat,colour=year.coef),size=3)+
  scale_colour_gradient(low="yellow",high ="red")+
  ggtitle("Average Annual Increase in Temperature - January")+
  labs(colour='Average Annual \nTemperature Increase (°C)')+xlab("Longitude")+ylab("Latitude")