In [87]:
import pandas as pd
import numpy as np

days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']

# Seeded NumPy random number generator (NumPy's counterpart to Python's
# `random` library). With a fixed seed the same "random" temperatures are
# produced on every run, so the notebook is reproducible.
rng = np.random.default_rng(42)

# standard_normal((5, 3)) draws a 5x3 array -- 5 working days over 3 weeks --
# from the standard normal distribution (mean 0, standard deviation 1).
# Multiplying by 6 widens the spread to a standard deviation of 6, which gives
# some chilly weather (in Celsius)!
temperature_df = pd.DataFrame(
    rng.standard_normal((5, 3)) * 6,
    index=days,
    columns=['week one', 'week two', 'week three'],
)

print(temperature_df)

            week one   week two  week three
Monday      0.606179 -12.344541    7.121184
Tuesday    -6.385815  -1.971156    9.076618
Wednesday  -6.524694  -4.129337    3.054195
Thursday   -6.298239 -16.792902  -10.056815
Friday     10.797105  -0.416172   -7.543754


In [88]:
# What about the weekend?
# Re-label the rows with all seven days. reindex() keeps the rows whose label
# already exists and adds the new labels (Saturday, Sunday) as rows of NaN.

days = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
    'Saturday', 'Sunday',
]

temperature_df = temperature_df.reindex(index=days)

print(temperature_df)

            week one   week two  week three
Monday      0.606179 -12.344541    7.121184
Tuesday    -6.385815  -1.971156    9.076618
Wednesday  -6.524694  -4.129337    3.054195
Thursday   -6.298239 -16.792902  -10.056815
Friday     10.797105  -0.416172   -7.543754
Saturday         NaN        NaN         NaN
Sunday           NaN        NaN         NaN


In [89]:
# Boolean mask of the frame: True where a value is missing (NaN).
# isna() is the same method as isnull(); isnull is just an alias for it.
temperature_df.isna()

Unnamed: 0,week one,week two,week three
Monday,False,False,False
Tuesday,False,False,False
Wednesday,False,False,False
Thursday,False,False,False
Friday,False,False,False
Saturday,True,True,True
Sunday,True,True,True


In [90]:
# The inverse mask: True where a value is present, False where it is missing.
# notna() is the same method as notnull(); notnull is just an alias for it.
temperature_df.notna()

Unnamed: 0,week one,week two,week three
Monday,True,True,True
Tuesday,True,True,True
Wednesday,True,True,True
Thursday,True,True,True
Friday,True,True,True
Saturday,False,False,False
Sunday,False,False,False


In [91]:
# Show a copy of the frame with every missing value replaced by 0.
# The result is not assigned, so temperature_df itself still contains the NaNs.
temperature_df.fillna(value=0)

Unnamed: 0,week one,week two,week three
Monday,0.606179,-12.344541,7.121184
Tuesday,-6.385815,-1.971156,9.076618
Wednesday,-6.524694,-4.129337,3.054195
Thursday,-6.298239,-16.792902,-10.056815
Friday,10.797105,-0.416172,-7.543754
Saturday,0.0,0.0,0.0
Sunday,0.0,0.0,0.0


In [92]:
# The fill value does not have to be a number: here every missing value is
# replaced by the text "missing" (again only in the displayed copy).
temperature_df.fillna(value="missing")

Unnamed: 0,week one,week two,week three
Monday,0.606179,-12.344541,7.121184
Tuesday,-6.385815,-1.971156,9.076618
Wednesday,-6.524694,-4.129337,3.054195
Thursday,-6.298239,-16.792902,-10.056815
Friday,10.797105,-0.416172,-7.543754
Saturday,missing,missing,missing
Sunday,missing,missing,missing


In [93]:
# Forward fill: each missing value takes the last valid value above it in the
# same column, so Saturday and Sunday repeat Friday's temperatures.
# ffill() replaces fillna(method='pad'), whose `method` keyword is deprecated
# in recent pandas releases.
temperature_df.ffill()

Unnamed: 0,week one,week two,week three
Monday,0.606179,-12.344541,7.121184
Tuesday,-6.385815,-1.971156,9.076618
Wednesday,-6.524694,-4.129337,3.054195
Thursday,-6.298239,-16.792902,-10.056815
Friday,10.797105,-0.416172,-7.543754
Saturday,10.797105,-0.416172,-7.543754
Sunday,10.797105,-0.416172,-7.543754


In [94]:
# Backward fill: each missing value takes the next valid value below it in the
# same column. Saturday and Sunday are the last rows, so there is nothing
# below them to copy from and they stay NaN.
# bfill() replaces fillna(method='bfill'), whose `method` keyword is
# deprecated in recent pandas releases.
temperature_df.bfill()

Unnamed: 0,week one,week two,week three
Monday,0.606179,-12.344541,7.121184
Tuesday,-6.385815,-1.971156,9.076618
Wednesday,-6.524694,-4.129337,3.054195
Thursday,-6.298239,-16.792902,-10.056815
Friday,10.797105,-0.416172,-7.543754
Saturday,,,
Sunday,,,


In [95]:
# Show a copy of the frame without the rows that contain a missing value
# (the defaults, spelled out: drop rows, and drop them if any value is NaN).
temperature_df.dropna(axis="index", how="any")

Unnamed: 0,week one,week two,week three
Monday,0.606179,-12.344541,7.121184
Tuesday,-6.385815,-1.971156,9.076618
Wednesday,-6.524694,-4.129337,3.054195
Thursday,-6.298239,-16.792902,-10.056815
Friday,10.797105,-0.416172,-7.543754


In [96]:
# Fill each week's missing days with that week's own mean temperature.
# temperature_df.mean() gives one mean per column (missing values are skipped
# by default), and fillna() accepts that Series and matches its labels against
# the column names -- so no explicit Python loop over the columns is needed.
temperature_df = temperature_df.fillna(temperature_df.mean())

temperature_df

Unnamed: 0,week one,week two,week three
Monday,0.606179,-12.344541,7.121184
Tuesday,-6.385815,-1.971156,9.076618
Wednesday,-6.524694,-4.129337,3.054195
Thursday,-6.298239,-16.792902,-10.056815
Friday,10.797105,-0.416172,-7.543754
Saturday,-1.561093,-7.130822,0.330286
Sunday,-1.561093,-7.130822,0.330286


In [97]:
# Small example data set in which the "spotify" entry appears twice
# (first and last position of every list).
review_service_dict = {
    "service": [
        "spotify",
        "tidal",
        "apple music",
        "amazon music",
        "spotify",
    ],
    "user_base": [465_000_000, 6_700_000, 88_000_000, 82_000_000, 465_000_000],
    "review": [9.4, 7, 8.4, 6.3, 9.4],
}

# Each dictionary key becomes a column, each list position a row.
service_frame = pd.DataFrame.from_dict(review_service_dict)

# Show the frame without the repeated row; the first occurrence is the one kept.
service_frame.drop_duplicates(keep="first")

Unnamed: 0,service,user_base,review
0,spotify,465000000,9.4
1,tidal,6700000,7.0
2,apple music,88000000,8.4
3,amazon music,82000000,6.3


In [98]:
# Load the 2017 happiness report. read_csv() already returns a DataFrame, so
# no additional pd.DataFrame(...) wrapping is needed.
hap17 = pd.read_csv("2017.csv")

# Is there any missing data?
# The answer is printed explicitly: a bare expression in the middle of a cell
# is evaluated and then discarded, so it would never be shown.
print(f"Any missing data: {hap17.isnull().values.any()} \n")

# Let's figure out which columns are missing data
for column in hap17:
    print(f"Column name: {column}")
    print(f"Is missing data: {hap17[column].isnull().any()} \n")

# Keep only the rows that have no happiness score, and of those rows only the
# two columns of interest. A single .loc call selects rows and columns at once.
missing_countries = hap17.loc[
    hap17['Happiness.Score'].isnull(),
    ['Country', 'Happiness.Score'],
]

print(missing_countries)



Column name: Country
Is missing data: False 

Column name: Happiness.Rank
Is missing data: False 

Column name: Happiness.Score
Is missing data: True 

Column name: Whisker.high
Is missing data: False 

Column name: Whisker.low
Is missing data: False 

Column name: Economy..GDP.per.Capita.
Is missing data: False 

Column name: Family
Is missing data: False 

Column name: Health..Life.Expectancy.
Is missing data: False 

Column name: Freedom
Is missing data: False 

Column name: Generosity
Is missing data: False 

Column name: Trust..Government.Corruption.
Is missing data: False 

Column name: Dystopia.Residual
Is missing data: False 

        Country  Happiness.Score
0        Norway              NaN
7   New Zealand              NaN
19        Chile              NaN
30       France              NaN


In [99]:
# Load the reports of the other years; their scores are used to estimate the
# values that are missing from the 2017 data.
hap15 = pd.read_csv("2015.csv")
hap16 = pd.read_csv("2016.csv")
hap18 = pd.read_csv("2018.csv")

# One entry per year: (report, name of its country column, name of its score
# column). The 2018 file uses different column names than 2015 and 2016.
yearly_reports = (
    (hap15, 'Country', 'Happiness Score'),
    (hap16, 'Country', 'Happiness Score'),
    (hap18, 'Country or region', 'Score'),
)

average_dict = {}

for country in missing_countries['Country']:
    print(country)
    # Mean score of this country within each yearly report, kept as a tuple.
    country_mean = tuple(
        report.loc[report[name_column] == country, score_column].mean()
        for report, name_column, score_column in yearly_reports
    )
    # Convert the tuple into a pandas Series in order to calculate the mean
    # across the three years.
    average_dict[country] = pd.Series(country_mean).mean()

print(average_dict)

# Replace the missing values with the ones you have found: map every country
# to its estimate (countries without an estimate map to NaN) and use that
# Series to fill only the gaps in the score column.
replacement_scores = hap17['Country'].map(average_dict)
hap17['Happiness.Score'] = hap17['Happiness.Score'].fillna(replacement_scores)

hap17

Norway
New Zealand
Chile
France
{'Norway': 7.538, 'New Zealand': 7.314666666666667, 'Chile': 6.617, 'France': 6.514}


Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.538,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.355280,0.400770,2.313707
2,Iceland,3,7.504,7.622030,7.385970,1.480633,1.610574,0.833552,0.627163,0.475540,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.564980,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182
...,...,...,...,...,...,...,...,...,...,...,...,...
150,Rwanda,151,3.471,3.543030,3.398970,0.368746,0.945707,0.326425,0.581844,0.252756,0.455220,0.540061
151,Syria,152,3.462,3.663669,3.260331,0.777153,0.396103,0.500533,0.081539,0.493664,0.151347,1.061574
152,Tanzania,153,3.349,3.461430,3.236570,0.511136,1.041990,0.364509,0.390018,0.354256,0.066035,0.621130
153,Burundi,154,2.905,3.074690,2.735310,0.091623,0.629794,0.151611,0.059901,0.204435,0.084148,1.683024
