In [2]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `video_game_sales.csv` data from the `Resources` folder into a Pandas DataFrame.

In [3]:
# Load the video game sales CSV from the Resources folder into a DataFrame
video_game_sales_df = pd.read_csv(filepath_or_buffer='Resources/video_game_sales.csv')

# Preview the first five rows
video_game_sales_df.head(n=5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [4]:
# Count the distinct game titles in the Name column (missing values are not counted)
count_unique_names = video_game_sales_df['Name'].nunique(dropna=True)
count_unique_names

11493

In [5]:
# Remove every row that has a missing value in any column
updated_df = video_game_sales_df.dropna(axis=0, how='any')
updated_df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [6]:
# Remove the rank, the title and the per-region *_Sales columns,
# leaving Platform, Year, Genre, Publisher and Global_Sales
cleaned_df = updated_df.drop(
    ['Rank', 'Name', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'],
    axis=1,
)
cleaned_df

Unnamed: 0,Platform,Year,Genre,Publisher,Global_Sales
0,Wii,2006.0,Sports,Nintendo,82.74
1,NES,1985.0,Platform,Nintendo,40.24
2,Wii,2008.0,Racing,Nintendo,35.82
3,Wii,2009.0,Sports,Nintendo,33.00
4,GB,1996.0,Role-Playing,Nintendo,31.37
...,...,...,...,...,...
16593,GBA,2002.0,Platform,Kemco,0.01
16594,GC,2003.0,Shooter,Infogrames,0.01
16595,PS2,2008.0,Racing,Activision,0.01
16596,DS,2010.0,Puzzle,7G//AMES,0.01


In [7]:
# Count how many rows each platform has, most frequent first
platform_names = cleaned_df.Platform.value_counts()
platform_names

Platform
DS      2131
PS2     2127
PS3     1304
Wii     1290
X360    1234
PSP     1197
PS      1189
PC       938
XB       803
GBA      786
GC       542
3DS      499
PSV      410
PS4      336
N64      316
SNES     239
XOne     213
SAT      173
WiiU     143
2600     116
NES       98
GB        97
DC        52
GEN       27
NG        12
SCD        6
WS         6
3DO        3
TG16       2
GG         1
PCFX       1
Name: count, dtype: int64

In [7]:
# Determine which platforms to bin: every platform with fewer than 200 rows
names_to_replace = list(platform_names[platform_names < 200].index)

# Replace all rare platforms with "Other" in a single vectorized pass
# (one call with the whole list instead of one full-column replace per name)
cleaned_df['Platform'] = cleaned_df['Platform'].replace(names_to_replace, "Other")

# Check to make sure binning was successful
cleaned_df['Platform'].value_counts()


Platform
DS       2131
PS2      2127
PS3      1304
Wii      1290
X360     1234
PSP      1197
PS       1189
PC        938
XB        803
GBA       786
Other     737
GC        542
3DS       499
PSV       410
PS4       336
N64       316
SNES      239
XOne      213
Name: count, dtype: int64

In [8]:
# Tally the rows per publisher and preview the 40 most frequent
publisher_names = cleaned_df.Publisher.value_counts()
publisher_names.head(40)

Publisher
Electronic Arts                           1339
Activision                                 966
Namco Bandai Games                         928
Ubisoft                                    918
Konami Digital Entertainment               823
THQ                                        712
Nintendo                                   696
Sony Computer Entertainment                682
Sega                                       632
Take-Two Interactive                       412
Capcom                                     376
Atari                                      347
Tecmo Koei                                 338
Square Enix                                231
Warner Bros. Interactive Entertainment     217
Disney Interactive Studios                 214
Midway Games                               196
Eidos Interactive                          196
505 Games                                  192
Microsoft Game Studios                     189
Acclaim Entertainment                      184
D3P

In [9]:
# Determine which publishers to bin: every publisher with fewer than 200 rows
names_to_replace = list(publisher_names[publisher_names < 200].index)

# Replace all rare publishers with "Other" in a single vectorized pass
# (one call with the whole list instead of one full-column replace per name)
cleaned_df['Publisher'] = cleaned_df['Publisher'].replace(names_to_replace, "Other")

# Check to make sure binning was successful
cleaned_df['Publisher'].value_counts()

Publisher
Other                                     6460
Electronic Arts                           1339
Activision                                 966
Namco Bandai Games                         928
Ubisoft                                    918
Konami Digital Entertainment               823
THQ                                        712
Nintendo                                   696
Sony Computer Entertainment                682
Sega                                       632
Take-Two Interactive                       412
Capcom                                     376
Atari                                      347
Tecmo Koei                                 338
Square Enix                                231
Warner Bros. Interactive Entertainment     217
Disney Interactive Studios                 214
Name: count, dtype: int64

In [10]:
# Count how many rows each genre has, most frequent first
genre_names = cleaned_df.Genre.value_counts()
genre_names

Genre
Action          3251
Sports          2304
Misc            1686
Role-Playing    1470
Shooter         1282
Adventure       1274
Racing          1225
Platform         875
Simulation       848
Fighting         836
Strategy         670
Puzzle           570
Name: count, dtype: int64

In [11]:
# One-hot encode the categorical columns, dropping the first level of each.
# NOTE: this overwrites cleaned_df, so the Platform/Genre/Publisher columns no longer
# exist afterwards and the cell cannot be re-run without re-running the cells above.
cleaned_df = pd.get_dummies(
    data=cleaned_df,
    columns=['Platform', 'Genre', 'Publisher'],
    drop_first=True,
)
cleaned_df

Unnamed: 0,Year,Global_Sales,Platform_DS,Platform_GBA,Platform_GC,Platform_N64,Platform_Other,Platform_PC,Platform_PS,Platform_PS2,...,Publisher_Nintendo,Publisher_Other,Publisher_Sega,Publisher_Sony Computer Entertainment,Publisher_Square Enix,Publisher_THQ,Publisher_Take-Two Interactive,Publisher_Tecmo Koei,Publisher_Ubisoft,Publisher_Warner Bros. Interactive Entertainment
0,2006.0,82.74,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
1,1985.0,40.24,False,False,False,False,True,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,2008.0,35.82,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,2009.0,33.00,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,1996.0,31.37,False,False,False,False,True,False,False,False,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16593,2002.0,0.01,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
16594,2003.0,0.01,False,False,True,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
16595,2008.0,0.01,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
16596,2010.0,0.01,True,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False


In [12]:
# Build the feature matrix X from every column except the target, Global_Sales
X = cleaned_df.drop(columns='Global_Sales')

# Display sample data
X.head(5)

Unnamed: 0,Year,Platform_DS,Platform_GBA,Platform_GC,Platform_N64,Platform_Other,Platform_PC,Platform_PS,Platform_PS2,Platform_PS3,...,Publisher_Nintendo,Publisher_Other,Publisher_Sega,Publisher_Sony Computer Entertainment,Publisher_Square Enix,Publisher_THQ,Publisher_Take-Two Interactive,Publisher_Tecmo Koei,Publisher_Ubisoft,Publisher_Warner Bros. Interactive Entertainment
0,2006.0,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
1,1985.0,False,False,False,False,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,2008.0,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,2009.0,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,1996.0,False,False,False,False,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [13]:
# Target vector y: the Global_Sales column of the encoded frame
y = cleaned_df.loc[:, "Global_Sales"]

In [14]:
# Create an ordinary least squares linear regression model with scikit-learn
# (default settings; it is fitted in a later cell)
model = LinearRegression()

In [15]:
# Confirm the feature matrix dimensions as (rows, columns) before fitting
X.shape

(16291, 45)

In [16]:
# Fit the model on the full feature matrix X and target y (no hold-out set is used here)
model.fit(X, y)

LinearRegression()

In [17]:
# Display the fitted coefficients: model.coef_ holds one value per column of X,
# so this prints an array rather than a single slope
print(f"Model's slope: {model.coef_}")

Model's slope: [-0.04186393 -0.07372681 -0.47574144 -0.49909746 -0.48656895 -0.26406037
 -0.10187979 -0.20083962  0.03873267  0.38406937  0.69178885 -0.06826651
  0.10542584 -0.37551605  0.23929358  0.39956085 -0.30945897  0.47659908
 -0.20562938  0.0215821  -0.11558575  0.22197869 -0.20721128  0.00736811
  0.08806322  0.18042265 -0.04938702 -0.0888981  -0.23497856 -0.33896977
 -0.24926535 -0.09178259  0.10361074 -0.34336641 -0.38925059  1.94593889
 -0.39406113 -0.29846101  0.05983664 -0.12034024 -0.18222831  0.22929205
 -0.56743326 -0.16544122 -0.01229438]


In [18]:
# Display the y-intercept (the constant term of the fitted model)
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 84.722536522358


In [20]:
# Display the model's formula. Because coef_ is an array, the output reads as
# intercept + [coefficient vector]X, i.e. each coefficient multiplies its own feature column
print(f"Model's formula: y = {model.intercept_} + {model.coef_}X")

Model's formula: y = 84.722536522358 + [-0.04186393 -0.07372681 -0.47574144 -0.49909746 -0.48656895 -0.26406037
 -0.10187979 -0.20083962  0.03873267  0.38406937  0.69178885 -0.06826651
  0.10542584 -0.37551605  0.23929358  0.39956085 -0.30945897  0.47659908
 -0.20562938  0.0215821  -0.11558575  0.22197869 -0.20721128  0.00736811
  0.08806322  0.18042265 -0.04938702 -0.0888981  -0.23497856 -0.33896977
 -0.24926535 -0.09178259  0.10361074 -0.34336641 -0.38925059  1.94593889
 -0.39406113 -0.29846101  0.05983664 -0.12034024 -0.18222831  0.22929205
 -0.56743326 -0.16544122 -0.01229438]X


In [22]:
# Make predictions using the X set (the same rows the model was fitted on)
predicted_y_values = model.predict(X)
predicted_y_values

array([ 2.83982912,  3.52649446,  2.85236746, ...,  0.70586767,
       -0.09896019,  0.22126265])

In [23]:
# Start from a new frame holding the year and the actual sales,
# then attach the model's predictions as a third column
predicted_sales_df = cleaned_df[['Year', 'Global_Sales']].assign(
    sales_predicted=predicted_y_values
)

# Display sample data
predicted_sales_df.head(20)

Unnamed: 0,Year,Global_Sales,sales_predicted
0,2006.0,82.74,2.839829
1,1985.0,40.24,3.526494
2,2008.0,35.82,2.852367
3,2009.0,33.0,2.714237
4,1996.0,31.37,2.932076
5,1989.0,30.26,2.929849
6,2006.0,30.01,2.837686
7,2006.0,29.02,2.813141
8,2009.0,28.62,3.025114
9,1984.0,28.31,3.526802


In [24]:
# Compute the metrics for the linear regression model.
# NOTE: X and y are the same data the model was fitted on, so these are in-sample
# metrics, not an estimate of performance on unseen data.
# model.score returns R^2 for a regressor, so `score` and `r2` are the same quantity.
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
# Spread of the actual sales values, for comparison against the RMSE
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.1240443790397141.
The r2 is 0.1240443790397141.
The mean squared error is 2.15171316323733.
The root mean squared error is 1.4668718973507298.
The standard deviation is 1.5672964014323438.


In [None]:
# Add a new column that has the status.
# Sales_Status is 1 when Global_Sales is at least 1.00 and 0 otherwise.
# NOTE(review): the 1.00 threshold comes from the original filter; confirm that a
# 0/1 flag is the intended encoding of "status".
column_values = (cleaned_df['Global_Sales'] >= 1.00).astype(int)

# DataFrame.insert requires a `loc`, modifies the frame in place and returns None,
# so it cannot produce final_df. assign returns a new frame and leaves cleaned_df untouched.
final_df = cleaned_df.assign(Sales_Status=column_values)
final_df.head(5)

### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** Because the balanced accuracy score is 95%, the logistic regression model performs well. That said, this is most likely a consequence of the imbalanced data.
The healthy (low-risk) loans far outnumber the non-healthy (high-risk) loans, so the model does a better job of predicting healthy loan statuses than non-healthy ones.

Looking at the imbalanced classification report, the model predicts healthy loans 100% of the time and non-healthy loans 85% of the time.

---

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The balanced accuracy score of the oversampled model is 99%, which is higher than the 95% achieved on the imbalanced data. The oversampled model is better at identifying high-risk loans, and it also produces far fewer false positives.