In [55]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

## Data Introduction

The data used in this study is drawn from three sources.

* Tokyo Olympics Dataset (Web scraping)
* GDP Dataset (Web scraping)
* Tokyo Olympics Dataset Teams (Kaggle)

For the class DS5100 in the MSDS program at the University of Virginia, we explored ways to analyze the data pulled from these three sources. From these sources we would like to predict how many gold medals a nation will win based on features pulled from these datasets. Some features we will look at are the number of silver and bronze medals, the continent in which each country resides, GDP, population, and GDP per capita.

## Data Preparation
Although a pipeline was created, we still need to prepare the data so that it can be fit with a multiple linear regression model, as shown below.

In [73]:
# Load the merged Olympics/GDP table; the first CSV column becomes the index.
original = pd.read_csv('../../data/final_olympic_cont.csv', index_col=[0])

# Build the modelling frame in one chain:
#   1. drop the columns that are not used as features,
#   2. drop every row that contains a NaN,
#   3. convert the formatted text columns to numbers.
df = (
    original
    .drop(['Total', 'Name', 'Country', 'GDP abbreviated', 'NOC'], axis=1)
    .dropna()
    .assign(**{
        # Skip the first character (the "$" seen in the raw values) and strip commas.
        "GDP": lambda frame: frame["GDP"].map(
            lambda text: int(text[1:].replace(",", ""))
        ),
        "GDP per capita": lambda frame: frame["GDP per capita"].map(
            lambda text: int(text[1:].replace(",", ""))
        ),
        # Only comma separators to strip here.
        "Population": lambda frame: frame["Population"].map(
            lambda text: int(text.replace(",", ""))
        ),
        # Drop the last character (the "%" seen in the raw values) before casting.
        "GDP growth": lambda frame: frame["GDP growth"].map(
            lambda text: text[:-1]
        ).astype(float),
    })
)
df.head()

Unnamed: 0,Gold,Silver,Bronze,GDP,GDP growth,Population,GDP per capita,Discipline,Continents
0,39,41,33,19485394000000,2.27,325084756,59939,47,North America
1,38,32,18,12237700479375,6.9,1421021791,8612,33,Asia
2,27,14,17,4872415104315,1.71,127502725,38214,48,Asia
3,22,21,22,2637866340434,1.79,66727461,39532,28,Europe
4,20,28,23,1578417211937,1.55,145530082,10846,34,Asia


In [50]:
# One-hot encode the categorical columns. drop_first=True leaves out the
# first category of each encoded column, which then acts as the baseline.
df = pd.get_dummies(data=df, drop_first=True)

In [51]:
# Predictors: every column except the response.
x = df.drop(columns=['Gold'])
# Response: the gold-medal count used as the training target.
y = df['Gold']
# Preview the first five rows of each.
print(x.head())
print(y.iloc[:5])

   Silver  Bronze             GDP  GDP growth  Population  GDP per capita  \
0      41      33  19485394000000        2.27   325084756           59939   
1      32      18  12237700479375        6.90  1421021791            8612   
2      14      17   4872415104315        1.71   127502725           38214   
3      21      22   2637866340434        1.79    66727461           39532   
4      28      23   1578417211937        1.55   145530082           10846   

   Discipline  Continents_Asia  Continents_Australia  Continents_Europe  \
0          47                0                     0                  0   
1          33                1                     0                  0   
2          48                1                     0                  0   
3          28                0                     0                  1   
4          34                1                     0                  0   

   Continents_North America  Continents_South America  
0                         1   

In [52]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Running a linear model

In [53]:
# LinearRegression is already imported in the notebook's first cell, so it is
# not re-imported here (imports belong in one place at the top).
# Fit a linear regression model on the training split.
LR = LinearRegression()
LR.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [54]:
# Predicted gold-medal counts for the held-out test rows.
y_prediction = LR.predict(X=x_test)
y_prediction

array([ 4.81718702, 67.06784382,  0.836584  , 21.42443177,  2.5446173 ,
        2.56349605,  9.2271289 ,  0.67823442,  1.88802242,  6.11548022,
       -0.1165528 , 11.20332434,  0.61331336, 10.31856607, -0.24658078])

## Analysis of Linear Model
Statistics for the linear model performance

In [66]:
# Coefficient of determination on the test split.
r_squared = r2_score(y_test, y_prediction)
print("R^2 = {}".format(r_squared))

# Mean squared error on the test split.
mean_squared_error_val = mean_squared_error(y_test, y_prediction)
print("Mean Squared Error = {}".format(mean_squared_error_val))

# RMSE is the square root of the MSE computed above, so reuse that value
# instead of calling mean_squared_error a second time.
rmse = np.sqrt(mean_squared_error_val)
print("Root Mean Squared Error = {}".format(rmse))

R^2 = 0.44637746693029345
Mean Squared Error = 57.84740334430577
Root Mean Squared Error = 7.6057480463334945


## Side by Side comparison
The predictions for the testing set are displayed alongside the actual values.

In [93]:
# Build a table of the test-set countries with the model's prediction attached.
#
# x_test.index holds index *labels* carried over from `original`, so the rows
# must be selected with label-based .loc. Positional .iloc only gives the same
# rows when the index happens to be exactly 0..n-1.
#
# .assign returns a new frame, so `original` is left untouched and no column
# is written into a slice.
#
# NOTE: the name `display` shadows IPython's display() helper; it is kept
# because later cells refer to this variable by that name.
display = original.loc[x_test.index].assign(Prediction=y_prediction)
display.head()

Unnamed: 0,Name,Gold,Silver,Bronze,Total,Country,GDP,GDP abbreviated,GDP growth,Population,GDP per capita,NOC,Discipline,Continents,Prediction
22,SWE Sweden,3,6,0,9,Sweden,"$535,607,385,506",$536 billion,2.29%,9904896,"$54,075",Sweden,6,Europe,4.817187
0,USA United States of America,39,41,33,113,United States,"$19,485,394,000,000",$19.485 trillion,2.27%,325084756,"$59,939",United States of America,47,North America,67.067844
51,POR Portugal,1,1,2,4,Portugal,"$219,308,128,887",$219 billion,2.68%,10288527,"$21,316",Portugal,2,Europe,0.836584
4,ROC ROC,20,28,23,71,Russia,"$1,578,417,211,937",$1.578 trillion,1.55%,145530082,"$10,846",ROC,34,Asia,21.424432
58,COL Colombia,0,4,1,5,Colombia,"$314,457,601,860",$314 billion,1.79%,48909839,"$6,429",Colombia,3,South America,2.544617
18,KEN Kenya,4,4,2,10,Kenya,"$79,263,075,749",$79.26 billion,4.87%,50221142,"$1,578",Kenya,4,Africa,2.563496
10,CAN Canada,7,6,11,24,Canada,"$1,647,120,175,449",$1.647 trillion,3.05%,36732095,"$44,841",Canada,30,North America,9.227129
34,GRE Greece,2,1,1,4,Greece,"$203,085,551,429",$203 billion,1.35%,10569450,"$19,214",Greece,6,Europe,0.678234
48,AUT Austria,1,1,5,7,Austria,"$416,835,975,862",$417 billion,3.04%,8819901,"$47,261",Austria,5,Europe,1.888022
12,NZL New Zealand,7,6,7,20,New Zealand,"$204,139,049,909",$204 billion,3.03%,4702034,"$43,415",New Zealand,13,Australia,6.11548


## Side by Side Comparison

In [95]:
# Country name, actual gold count, and model prediction side by side.
display.loc[:, ["Name", "Gold", "Prediction"]]

Unnamed: 0,Name,Gold,Prediction
22,SWE Sweden,3,4.817187
0,USA United States of America,39,67.067844
51,POR Portugal,1,0.836584
4,ROC ROC,20,21.424432
58,COL Colombia,0,2.544617
18,KEN Kenya,4,2.563496
10,CAN Canada,7,9.227129
34,GRE Greece,2,0.678234
48,AUT Austria,1,1.888022
12,NZL New Zealand,7,6.11548
