# Linear Regression Model

## 1. Multiple Variables

### Sales based on ESRB_Rating and Critic_Score
Predict game's sale based ESRB_Rating
<br>Example: If ESRB_Rating = E, how likely will the sales be?

- model = LinearRegression()
- Target variable (y): Total_Sales
- Independent variable (X): ESRB_Rating

In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [2]:
games_df = pd.read_csv('Cleaned_Data/all_columns_df.csv')
games_df

Unnamed: 0,Rank,Name,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Critic_Score,User_Score,Year,Country,Total_Sales
0,1,Wii Sports,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,,2006.0,Japan,82.86
1,2,Super Mario Bros.,Platform,,NES,Nintendo,Nintendo EAD,10.0,,1985.0,Japan,40.24
2,3,Mario Kart Wii,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,9.1,2008.0,Japan,37.14
3,4,PlayerUnknown's Battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,,,2017.0,,36.60
4,5,Wii Sports Resort,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,8.8,2009.0,Japan,33.09
...,...,...,...,...,...,...,...,...,...,...,...,...
19857,19858,FirePower for Microsoft Combat Flight Simulator 3,Simulation,T,PC,GMX Media,Shockwave Productions,,,2004.0,,0.01
19858,19859,Tom Clancy's Splinter Cell,Shooter,T,PC,Ubisoft,Ubisoft,,,2003.0,Europe,0.01
19859,19860,Ashita no Joe 2: The Anime Super Remix,Fighting,,PS2,Capcom,Capcom,,,2002.0,Japan,0.01
19860,19861,Tokyo Yamanote Boys for V: Main Disc,Adventure,,PSV,Rejet,Rejet,,,2017.0,,0.01


In [3]:
games_df.isna().sum()

Rank                0
Name                0
Genre               0
ESRB_Rating      5937
Platform            0
Publisher           0
Developer_x         2
Critic_Score    15156
User_Score      19624
Year                3
Country          7985
Total_Sales         0
dtype: int64

In [4]:
games_df.nunique()

Rank            19862
Name            13840
Genre              20
ESRB_Rating         6
Platform           40
Publisher         926
Developer_x      3185
Critic_Score       86
User_Score         44
Year               45
Country            20
Total_Sales       648
dtype: int64

In [9]:
games_df.drop(['Rank', 'Name', 'User_Score'], axis=1, inplace=True)
games_df

Unnamed: 0,Genre,ESRB_Rating,Platform,Publisher,Developer_x,Critic_Score,Year,Country,Total_Sales
0,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,2006.0,Japan,82.86
1,Platform,,NES,Nintendo,Nintendo EAD,10.0,1985.0,Japan,40.24
2,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,2008.0,Japan,37.14
3,Shooter,,PC,PUBG Corporation,PUBG Corporation,,2017.0,,36.60
4,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,2009.0,Japan,33.09
...,...,...,...,...,...,...,...,...,...
19857,Simulation,T,PC,GMX Media,Shockwave Productions,,2004.0,,0.01
19858,Shooter,T,PC,Ubisoft,Ubisoft,,2003.0,Europe,0.01
19859,Fighting,,PS2,Capcom,Capcom,,2002.0,Japan,0.01
19860,Adventure,,PSV,Rejet,Rejet,,2017.0,,0.01


In [10]:
# To see the row count if drop NaN in all columns
games_df.dropna().count()

Genre           3579
ESRB_Rating     3579
Platform        3579
Publisher       3579
Developer_x     3579
Critic_Score    3579
Year            3579
Country         3579
Total_Sales     3579
dtype: int64

In [18]:
# Choose columns 'Total_Sales', 'ESRB_Rating', and 'Critic_Score'
predict_df = games_df[['Total_Sales','ESRB_Rating', 'Critic_Score']]
predict_df

Unnamed: 0,Total_Sales,ESRB_Rating,Critic_Score
0,82.86,E,7.7
1,40.24,,10.0
2,37.14,E,8.2
3,36.60,,
4,33.09,E,8.0
...,...,...,...
19857,0.01,T,
19858,0.01,T,
19859,0.01,,
19860,0.01,,


In [19]:
# Drop NaNs in predict_df
predict_df.dropna(inplace=True)
predict_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predict_df.dropna(inplace=True)


Unnamed: 0,Total_Sales,ESRB_Rating,Critic_Score
0,82.86,E,7.7
2,37.14,E,8.2
4,33.09,E,8.0
5,31.38,E,9.4
6,30.80,E,9.1
...,...,...,...
19790,0.01,T,8.4
19792,0.01,T,7.0
19794,0.01,E,6.0
19800,0.01,E,6.7


In [20]:
predict_df.ESRB_Rating.nunique()

5

In [21]:
cat = predict_df.dtypes[predict_df.dtypes == "object"].index.tolist()
cat

['ESRB_Rating']

In [22]:
# Encode 'ESRB_Rating' column
from sklearn.preprocessing import OneHotEncoder

# creating instance of one-hot-encoder
enc = OneHotEncoder(sparse=False)
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(predict_df[cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(cat)

encode_df.head()



Unnamed: 0,ESRB_Rating_E,ESRB_Rating_E10,ESRB_Rating_M,ESRB_Rating_RP,ESRB_Rating_T
0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0


In [23]:
encode_df.tail()

Unnamed: 0,ESRB_Rating_E,ESRB_Rating_E10,ESRB_Rating_M,ESRB_Rating_RP,ESRB_Rating_T
4563,0.0,0.0,0.0,0.0,1.0
4564,0.0,0.0,0.0,0.0,1.0
4565,1.0,0.0,0.0,0.0,0.0
4566,1.0,0.0,0.0,0.0,0.0
4567,0.0,0.0,0.0,0.0,1.0


In [24]:
# Merge one-hot encoded features and drop the originals
predict_df = predict_df.merge(encode_df,left_index=True, right_index=True)
predict_df = predict_df.drop(cat,1)
predict_df.head()

  predict_df = predict_df.drop(cat,1)


Unnamed: 0,Total_Sales,Critic_Score,ESRB_Rating_E,ESRB_Rating_E10,ESRB_Rating_M,ESRB_Rating_RP,ESRB_Rating_T
0,82.86,7.7,1.0,0.0,0.0,0.0,0.0
2,37.14,8.2,1.0,0.0,0.0,0.0,0.0
4,33.09,8.0,1.0,0.0,0.0,0.0,0.0
5,31.38,9.4,1.0,0.0,0.0,0.0,0.0
6,30.8,9.1,1.0,0.0,0.0,0.0,0.0


In [25]:
# Assign target variable and independent variable
X = predict_df.iloc[:, 1:7]
y = predict_df.Total_Sales.values

In [26]:
# Split data into testing and training set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)

In [27]:
# Create and train the model
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

In [28]:
# Test the model
predictions = model.predict(X_test)

In [29]:
from sklearn.metrics import r2_score
r_squared = r2_score(y_test, predictions)
r_squared

0.07663700683466668