# Preparing data

### Reading DataFrames from multiple files in a loop

In [1]:
import pandas as pd

In [2]:
# Per-medal CSV files in the order Silver, Bronze, Gold. Later cells pick the
# loaded frames by position, so this order matters.
medal_dir = 'summer_olympic_medals/'
filenames = [medal_dir + medal + '.csv' for medal in ('Silver', 'Bronze', 'Gold')]
# Container for the loaded DataFrames (one per entry of `filenames`).
dataframes = list()

In [3]:
# Load every medal file into its own DataFrame, in the same order as
# `filenames`. The list is rebuilt in a single expression rather than appended
# to, so re-running this cell does not accumulate duplicate frames.
dataframes = [pd.read_csv(filename) for filename in filenames]

# Preview the frame loaded from the second file (index 1).
dataframes[1].head()

Unnamed: 0,NOC,Country,Total
0,USA,United States,1052.0
1,URS,Soviet Union,584.0
2,GBR,United Kingdom,505.0
3,FRA,France,475.0
4,GER,Germany,454.0


### Combining DataFrames from multiple data files

In [4]:
# Work on an independent copy of the third loaded frame (index 2, read from
# Gold.csv) so that relabelling its columns leaves `dataframes` untouched.
new_labels = ['NOC', 'Country', 'Gold']
medals = dataframes[2].copy()
medals.columns = new_labels

In [5]:
# Attach the 'Total' column of the Silver (index 0) and Bronze (index 1)
# frames under their medal names. Column assignment aligns on the row index,
# so this presumably relies on all three files listing the countries in the
# same row order -- TODO confirm against the CSV files.
for position, medal in enumerate(['Silver', 'Bronze']):
    medals[medal] = dataframes[position]['Total']
medals.head()

Unnamed: 0,NOC,Country,Gold,Silver,Bronze
0,USA,United States,2088.0,1195.0,1052.0
1,URS,Soviet Union,838.0,627.0,584.0
2,GBR,United Kingdom,498.0,591.0,505.0
3,FRA,France,378.0,461.0,475.0
4,GER,Germany,407.0,350.0,454.0


### Reindexing DataFrame from a list

In [6]:
# Small example table: one mean-temperature value for each of four months,
# with the month abbreviation promoted to the row index.
weather_dict = dict([
    ('Month', ['Apr', 'Jan', 'Jul', 'Oct']),
    ('Mean TemperatureF', [61.956044, 32.133333, 68.934783, 43.434783]),
])
weather = pd.DataFrame(weather_dict).set_index('Month')
weather

Unnamed: 0_level_0,Mean TemperatureF
Month,Unnamed: 1_level_1
Apr,61.956044
Jan,32.133333
Jul,68.934783
Oct,43.434783


In [7]:
# Month abbreviations in calendar order, one quarter per row.
year = [
    'Jan', 'Feb', 'Mar',
    'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep',
    'Oct', 'Nov', 'Dec',
]

In [8]:
# Conform the table to the labels in `year`: rows come out in that order, and
# labels with no matching row in `weather` are filled with NaN.
weather2 = weather.reindex(index=year)
weather2

Unnamed: 0_level_0,Mean TemperatureF
Month,Unnamed: 1_level_1
Jan,32.133333
Feb,
Mar,
Apr,61.956044
May,
Jun,
Jul,68.934783
Aug,
Sep,
Oct,43.434783


In [9]:
# Reindex to the labels in `year`, then forward-fill: each missing row takes
# the value of the closest preceding row that has data.
full_year = weather.reindex(index=year)
weather3 = full_year.ffill()
weather3

Unnamed: 0_level_0,Mean TemperatureF
Month,Unnamed: 1_level_1
Jan,32.133333
Feb,32.133333
Mar,32.133333
Apr,61.956044
May,61.956044
Jun,61.956044
Jul,68.934783
Aug,68.934783
Sep,68.934783
Oct,43.434783


### Reindexing using another DataFrame Index

In [25]:
# Both files are read with identical options: no header row, explicit column
# names, and the first two fields (name, gender) combined into the row index,
# which leaves `count` as the only data column.
read_options = dict(header=None, names=['name', 'gender', 'count'], index_col=(0, 1))
names_1981 = pd.read_csv('baby_names/names1981.csv', **read_options)
names_1881 = pd.read_csv('baby_names/names1881.csv', **read_options)

In [26]:
# Preview the first rows of the 1881 table.
names_1881.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
name,gender,Unnamed: 2_level_1
Mary,F,6919
Anna,F,2698
Emma,F,2034
Elizabeth,F,1852
Margaret,F,1658


In [27]:
# Preview the first rows of the 1981 table.
names_1981.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
name,gender,Unnamed: 2_level_1
Jennifer,F,57032
Jessica,F,42519
Amanda,F,34370
Sarah,F,28162
Melissa,F,28003


In [28]:
# Print the (rows, columns) shape of each table, 1881 first.
for names in (names_1881, names_1981):
    print(names.shape)

(1935, 1)
(19455, 1)


In [29]:
# Look up every (name, gender) label of the 1881 table in the 1981 table.
# The result has one row per 1881 label; labels absent from 1981 get NaN.
index_1881 = names_1881.index
common_names = names_1981.reindex(index=index_1881)
common_names.shape

(1935, 1)

In [30]:
# Discard every row that contains a missing value (the defaults of dropna,
# spelled out), then report how many rows remain.
common_names = common_names.dropna(axis='index', how='any')
common_names.shape

(1587, 1)

### Broadcasting in arithmetic formulas

In [34]:
# Load the Pittsburgh 2013 file with its 'Date' column parsed into the row
# index. NOTE: this rebinds `weather`, which the reindexing cells earlier in
# the notebook use for the small monthly table.
weather = pd.read_csv(
    'pittsburgh2013.csv',
    index_col='Date',
    parse_dates=True,
)
weather.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 365 entries, 2013-01-01 to 2013-12-31
Data columns (total 22 columns):
Max TemperatureF             365 non-null int64
Mean TemperatureF            365 non-null int64
Min TemperatureF             365 non-null int64
Max Dew PointF               365 non-null int64
Mean Dew PointF              365 non-null int64
Min DewpointF                365 non-null int64
Max Humidity                 365 non-null int64
Mean Humidity                365 non-null int64
Min Humidity                 365 non-null int64
Max Sea Level PressureIn     365 non-null float64
Mean Sea Level PressureIn    365 non-null float64
Min Sea Level PressureIn     365 non-null float64
Max VisibilityMiles          365 non-null int64
Mean VisibilityMiles         365 non-null int64
Min VisibilityMiles          365 non-null int64
Max Wind SpeedMPH            365 non-null int64
Mean Wind SpeedMPH           365 non-null int64
Max Gust SpeedMPH            244 non-null float64
Prec

In [35]:
# Select the three temperature columns (Fahrenheit, per their names).
temps_f = weather[['Min TemperatureF', 'Mean TemperatureF', 'Max TemperatureF']]
# Scalar arithmetic broadcasts over every cell: C = (F - 32) * 5 / 9.
temps_c = (temps_f - 32) * 5 / 9
# Relabel the columns by replacing every 'F' with 'C'; in these three names
# the only 'F' is the trailing unit letter.
temps_c = temps_c.rename(columns=lambda label: label.replace('F', 'C'))
temps_c.head()

Unnamed: 0_level_0,Min TemperatureC,Mean TemperatureC,Max TemperatureC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-01-01,-6.111111,-2.222222,0.0
2013-01-02,-8.333333,-6.111111,-3.888889
2013-01-03,-8.888889,-4.444444,0.0
2013-01-04,-2.777778,-2.222222,-1.111111
2013-01-05,-3.888889,-1.111111,1.111111


### Computing percentage growth of GDP

In [37]:
# Load the GDP series with its 'DATE' column parsed into the row index.
gdp = pd.read_csv(
    'GDP/gdp_usa.csv',
    index_col='DATE',
    parse_dates=True,
)
gdp.head()

Unnamed: 0_level_0,VALUE
DATE,Unnamed: 1_level_1
1947-01-01,243.1
1947-04-01,246.3
1947-07-01,250.1
1947-10-01,260.3
1948-01-01,266.2


In [38]:
# Keep all rows from 2008 onward (label slice on the date index, all columns).
post2008 = gdp.loc['2008':]
# Downsample to annual frequency, keeping the last observation of each year.
# NOTE(review): recent pandas releases deprecate the 'A' alias in favour of
# 'YE'; it is kept here to match the pandas version this notebook was run on.
annual_groups = post2008.resample('A')
yearly = annual_groups.last()
yearly.head()

Unnamed: 0_level_0,VALUE
DATE,Unnamed: 1_level_1
2008-12-31,14549.9
2009-12-31,14566.5
2010-12-31,15230.2
2011-12-31,15785.3
2012-12-31,16297.3


In [39]:
# Year-over-year growth of VALUE, in percent. The first row has no previous
# year and is therefore NaN.
# The change is computed from the VALUE column explicitly rather than from the
# whole frame: once `growth` exists, a frame-wide pct_change() returns two
# columns, which cannot be assigned to the single `growth` column, so the
# original cell failed when re-run.
yearly['growth'] = yearly['VALUE'].pct_change() * 100
yearly

Unnamed: 0_level_0,VALUE,growth
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2008-12-31,14549.9,
2009-12-31,14566.5,0.11409
2010-12-31,15230.2,4.556345
2011-12-31,15785.3,3.644732
2012-12-31,16297.3,3.243524
2013-12-31,16999.9,4.311144
2014-12-31,17692.2,4.072377
2015-12-31,18222.8,2.999062
2016-12-31,18436.5,1.172707


### Converting currency of stocks

In [43]:
# Both files are read the same way: the 'Date' column is parsed and used as
# the row index.
date_indexed = dict(parse_dates=True, index_col='Date')
sp500 = pd.read_csv('sp500.csv', **date_indexed)
exchange = pd.read_csv('exchange.csv', **date_indexed)

In [44]:
# Preview the first rows of the S&P 500 table.
sp500.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-02,2058.899902,2072.360107,2046.040039,2058.199951,2708700000,2058.199951
2015-01-05,2054.439941,2054.439941,2017.339966,2020.579956,3799120000,2020.579956
2015-01-06,2022.150024,2030.25,1992.439941,2002.609985,4460110000,2002.609985
2015-01-07,2005.550049,2029.609985,2005.550049,2025.900024,3805480000,2025.900024
2015-01-08,2030.609985,2064.080078,2030.609985,2062.139893,3934010000,2062.139893


In [45]:
# Preview the first rows of the exchange-rate table.
exchange.head()

Unnamed: 0_level_0,GBP/USD
Date,Unnamed: 1_level_1
2015-01-02,0.65101
2015-01-05,0.65644
2015-01-06,0.65896
2015-01-07,0.66344
2015-01-08,0.66151


In [47]:
# Keep only the opening and closing price columns.
price_columns = ['Open', 'Close']
dollars = sp500[price_columns]
dollars.head()

Unnamed: 0_level_0,Open,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-02,2058.899902,2058.199951
2015-01-05,2054.439941,2020.579956
2015-01-06,2022.150024,2002.609985
2015-01-07,2005.550049,2025.900024
2015-01-08,2030.609985,2062.139893


In [50]:
# Multiply each row of prices by the 'GBP/USD' value with the same row label;
# matching along the row axis broadcasts the rate across both price columns.
# Presumably the rate converts dollars to pounds, as the variable names
# suggest -- confirm against the data source.
rate = exchange['GBP/USD']
pounds = dollars.mul(rate, axis='index')
pounds.head()

Unnamed: 0_level_0,Open,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-02,1340.364425,1339.90875
2015-01-05,1348.616555,1326.389506
2015-01-06,1332.51598,1319.639876
2015-01-07,1330.562125,1344.063112
2015-01-08,1343.268811,1364.126161
