# Datasets

## Fetching Data From Library

In [4]:
from sklearn.datasets import load_boston
import pandas as pd

In [6]:
# Ask for the (features, target) pair by keyword. scikit-learn deprecated positional
# use of this flag in 0.23 and made `return_X_y` keyword-only in 1.0, so the
# positional form `load_boston(True)` warns or raises on those versions.
# NOTE: `load_boston` itself was deprecated in 1.0 and removed in 1.2, so this cell
# requires scikit-learn < 1.2.
data, target = load_boston(return_X_y=True)

In [18]:
# Wrap the feature matrix in a DataFrame whose columns are the dataset's feature
# names, and append the target values as a trailing "label" column.
bostonDF = pd.DataFrame(data=data, columns=load_boston().feature_names).assign(label=target)

In [19]:
# Preview the first five rows (features plus the "label" column).
bostonDF.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,label
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


## Fetching Data From Web

This dataset records the game purchases of a set of customers; the label indicates whether or not each customer bought a new video game.

In [20]:
# Read the games-purchase CSV directly from its GitHub URL into a DataFrame.
games_csv_url = "https://github.com/bgweber/Twitch/raw/master/Recommendations/games-expand.csv"
gamesDF = pd.read_csv(games_csv_url)

In [22]:
# Preview the first five rows of the games frame.
gamesDF.head(n=5)

Unnamed: 0,G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,label
0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0
3,0,0,1,0,0,1,1,0,0,1,1
4,0,0,1,0,1,1,0,1,1,0,1


## BigQuery to Pandas

After creating the project and the credentials JSON file, set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to the path of that file.

In [1]:
import os

# `!export ...` runs in a throwaway subshell, so the variable never reaches the
# kernel process and later cells cannot see it. Setting it on os.environ makes it
# visible to code running in this kernel.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/ubuntu/mlops/dsdemo.json"

In [3]:
from google.cloud import bigquery

# The client is built with no explicit arguments.
# NOTE(review): presumably it picks up credentials from the
# GOOGLE_APPLICATION_CREDENTIALS environment variable — confirm.
client = bigquery.Client()
sql = """
SELECT *
FROM
`bigquery-public-data.samples.natality`
LIMIT 10
"""
# Submit the query, then materialize its result as a pandas DataFrame.
query_job = client.query(sql)
natalityDF = query_job.to_dataframe()
# Preview the first five of the (at most ten) returned rows.
natalityDF.head(n=5)

Unnamed: 0,source_year,year,month,day,wday,state,is_male,child_race,weight_pounds,plurality,...,alcohol_use,drinks_per_week,weight_gain_pounds,born_alive_alive,born_alive_dead,born_dead,ever_born,father_race,father_age,record_weight
0,2005,2005,7,,3.0,,False,,8.628893,1.0,...,False,,57.0,9.0,0.0,0.0,10,78,38,1
1,2005,2005,4,,6.0,,True,,2.678616,1.0,...,False,,23.0,7.0,0.0,0.0,8,78,39,1
2,2006,2006,5,,1.0,,True,,11.062796,1.0,...,False,,11.0,,,,8,68,41,1
3,2007,2007,3,,2.0,,False,,5.436599,2.0,...,False,,10.0,,,,8,78,42,1
4,2007,2007,4,,7.0,,False,,3.560466,1.0,...,False,,18.0,,,,8,78,43,1


## Kaggle To Pandas

In [15]:
# Load the NHL game table from a CSV at a path relative to the notebook's working
# directory, then preview its first five rows.
nhl_games_csv = "./nhl-game-data/game.csv"
nhlDF = pd.read_csv(nhl_games_csv)
nhlDF.head(n=5)

Unnamed: 0,game_id,season,type,date_time,date_time_GMT,away_team_id,home_team_id,away_goals,home_goals,outcome,home_rink_side_start,venue,venue_link,venue_time_zone_id,venue_time_zone_offset,venue_time_zone_tz
0,2011030221,20112012,P,2012-04-29,2012-04-29T19:00:00Z,1,4,3,4,home win OT,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT
1,2011030222,20112012,P,2012-05-01,2012-05-01T23:30:00Z,1,4,4,1,away win REG,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT
2,2011030223,20112012,P,2012-05-03,2012-05-03T23:30:00Z,4,1,3,4,home win OT,left,Prudential Center,/api/v1/venues/null,America/New_York,-4,EDT
3,2011030224,20112012,P,2012-05-06,2012-05-06T23:30:00Z,4,1,2,4,home win REG,left,Prudential Center,/api/v1/venues/null,America/New_York,-4,EDT
4,2011030225,20112012,P,2012-05-08,2012-05-08T23:30:00Z,1,4,3,1,away win REG,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT


# Prototype Models

## Linear Regressor

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [3]:
import pandas as pd

In [6]:
from sklearn.datasets import load_boston

# Load the dataset bunch once and read features, target and column names from it.
# This avoids a second load just to get the feature names, and avoids the
# positional `load_boston(True)` call, which scikit-learn deprecated in 0.23 and
# rejects from 1.0 on (`return_X_y` is keyword-only there).
# NOTE: `load_boston` was deprecated in 1.0 and removed in 1.2, so this cell
# requires scikit-learn < 1.2.
boston = load_boston()
data, target = boston.data, boston.target
bostonDF = pd.DataFrame(data=data, columns=boston.feature_names)

In [8]:
# Append the regression target as a trailing "label" column.
bostonDF = bostonDF.assign(label=target)

In [10]:
# Preview the first five rows (features plus the "label" column).
bostonDF.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,label
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [12]:
# Hold out 30% of the rows for evaluation; every column except "label" is a feature.
# A fixed random_state makes the shuffle, and so the resulting split and any metric
# computed from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    bostonDF.drop(columns=['label']),
    bostonDF['label'],
    test_size=0.3,
    random_state=42,
)

In [13]:
# Unfitted LinearRegression estimator built with default arguments; it is fitted on
# the training split in a later cell.
model = LinearRegression()

In [14]:
# IPython introspection: the trailing `?` shows the documentation for `model.score`.
# This is a development aid only; it does not affect any later cell.
model.score?

In [15]:
# Fit on the training split. `fit` returns the estimator, so the cell displays its repr.
model.fit(X=x_train, y=y_train)

LinearRegression()

In [27]:
from sklearn.metrics import mean_absolute_error

In [28]:
# Evaluate on the held-out split.
# - The second metric is the mean *absolute* error, so it is labelled as such
#   ("mean error" would denote the signed bias, a different quantity).
# - scikit-learn metrics take (y_true, y_pred) in that order. MAE is symmetric, so
#   the value is unchanged, but the conventional order avoids mistakes if this
#   line is later adapted to an asymmetric metric.
y_pred = model.predict(x_test)
print(f"R^2: {model.score(x_test, y_test):.5f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred):.5f}")

R^2: 0.73812
Mean Error: 3.22215


## Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pandas as pd

In [32]:
# Read the games-purchase CSV directly from its GitHub URL into a DataFrame.
games_csv_url = "https://github.com/bgweber/Twitch/raw/master/Recommendations/games-expand.csv"
gamesDF = pd.read_csv(games_csv_url)

In [34]:
# Hold out 30% of the rows for evaluation; every column except "label" is a feature.
# A fixed random_state makes the shuffle, and so the resulting split and any metric
# computed from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    gamesDF.drop(columns=['label']),
    gamesDF['label'],
    test_size=0.3,
    random_state=42,
)

In [35]:
# Build a default LogisticRegression and fit it on the training split in one step;
# `fit` returns the estimator itself, so `model` is the fitted classifier.
model = LogisticRegression().fit(x_train, y_train)
# Show the fitted estimator's repr as the cell output.
model

LogisticRegression()

In [56]:
# Score the classifier on the held-out split and report it.
accuracy = model.score(x_test, y_test)
print(f"Accuracy: {accuracy:.5f}")

# ROC AUC is computed from column 1 of predict_proba against the true labels.
positive_scores = model.predict_proba(x_test)[:, 1]
roc_auc = roc_auc_score(y_test.values, positive_scores)
print(f"ROC: {roc_auc:.5f}")

Accuracy: 0.87282
ROC: 0.75817
