In [1]:
import csv
def prepare_datasets(file_path):
    """ 
    Accepts: path to a tab-separated plaintext file
    Returns: a list containing a dictionary for every row in the file, 
        with the file column headers as keys
    """
    
    with open(file_path) as infile:
        reader = csv.DictReader(infile, delimiter=',')
        list_of_dicts = [dict(r) for r in reader]
        
    return list_of_dicts

In [2]:
otome_games = prepare_datasets("csvfiles/OtomeGames.csv")
print(otome_games[0])

{'\ufeffName': 'アンジェリークSpecial2', 'Year': '1998', 'Date': '96/12/06', 'Copies1stWeek': '14024', 'CopiesTotal': '14024', 'Platform': 'PS', 'Company': 'コーエー', 'CERO': '-', 'NoLI': '10', 'NoFemale': '2', 'NoFemaleLI': '0', 'NoFemaleFI': '0', 'NoLGBT': '0', 'NoCDM': '0', 'NoCDF': '0', '': '', 'Memo': '森村蘭（黑龙神子 无END 类反派）'}
{'\ufeffName': 'アンジェリーク・コレット', 'Game': 'アンジェリーク', 'Year': '1994', 'Release Date': '94/9/23', 'Race': '1', 'Age': '2', 'IsLI': '0', 'IsFI': '0', 'IsSingle': '1', 'IsHeroine': '1', 'IsV': '0'}


In [3]:
import pandas as pd
import numpy as np
games_df = pd.DataFrame(otome_games)

In [4]:
rel_games_df = games_df[['Year', 'Copies1stWeek', 'CopiesTotal', 'NoLI', 'NoFemale', 'NoFemaleLI',
                         'NoFemaleLI', 'NoFemaleFI', 'NoLGBT', 'NoCDM', 'NoCDF']]
rel_games_df.replace('', np.nan, inplace=True)

firstweek_df = rel_games_df.drop("CopiesTotal", axis=1)
totalsales_df = rel_games_df.drop("Copies1stWeek", axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rel_games_df.replace('', np.nan, inplace=True)


## Splitting the data into training set and test set for 1st week sales

First we'll copy the dataset to predict the total copies that are sold. Since there are 

In [5]:
from sklearn.preprocessing import StandardScaler

# Split data into features and label 
firstweek_df.dropna(subset=['Copies1stWeek'], inplace=True)
X = firstweek_df.copy()
y = firstweek_df["Copies1stWeek"].copy() 

# Instantiate scaler and fit on features
scaler = StandardScaler()
scaler.fit(X)

# Transform features
X_scaled = scaler.transform(X.values)

# View first instance
print(X_scaled[0])


[-3.04003058  0.24961607  1.23023437 -0.67061669 -0.09622504 -0.09622504
 -0.36153201 -0.18405254 -0.19518001  0.        ]




In [6]:
from sklearn.model_selection import train_test_split

# Split data into train and test
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled,
                                                                  y,
                                                             train_size=.7,
                                                           random_state=25)

# Check the splits are correct
print(f"Train size: {round(len(X_train_scaled) / len(X) * 100)}% \n\
Test size: {round(len(X_test_scaled) / len(X) * 100)}%")

Train size: 70% 
Test size: 30%


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Instnatiating the models 
logistic_regression = LogisticRegression()
svm = SVC()
tree = DecisionTreeClassifier()

# Training the models 
logistic_regression.fit(X_train_scaled, y_train)
svm.fit(X_train_scaled, y_train)
tree.fit(X_train_scaled, y_train)

# Making predictions with each model
log_reg_preds = logistic_regression.predict(X_test_scaled)
svm_preds = svm.predict(X_test_scaled)
tree_preds = tree.predict(X_test_scaled)

In [8]:
from sklearn.metrics import classification_report

# Store model predictions in a dictionary
# this makes it's easier to iterate through each model
# and print the results. 
model_preds = {
    "Logistic Regression": log_reg_preds,
    "Support Vector Machine": svm_preds,
    "Decision Tree": tree_preds
}

for model, preds in model_preds.items():
    print(f"{model} Results:\n{classification_report(y_test, preds)}", sep="\n\n")

Logistic Regression Results:
              precision    recall  f1-score   support

       10300       0.00      0.00      0.00       0.0
        1080       0.00      0.00      0.00       0.0
       10912       0.00      0.00      0.00       1.0
       11297       0.00      0.00      0.00       0.0
       11694       0.00      0.00      0.00       1.0
       11771       0.00      0.00      0.00       1.0
       12322       0.00      0.00      0.00       1.0
       12471       0.00      0.00      0.00       1.0
       12742       0.00      0.00      0.00       1.0
       12843       0.00      0.00      0.00       1.0
       13601       0.00      0.00      0.00       1.0
       13741       0.00      0.00      0.00       1.0
       13813       0.00      0.00      0.00       1.0
       14024       0.00      0.00      0.00       1.0
       14388       0.00      0.00      0.00       1.0
       17252       0.00      0.00      0.00       0.0
       17521       0.00      0.00      0.00       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Apply model to the Total Copies column instead

In [9]:
# Split data into features and label 
totalsales_df.dropna(subset=['CopiesTotal'], inplace=True)
X = totalsales_df.copy()
y = totalsales_df["CopiesTotal"].copy() 

# Instantiate scaler and fit on features
scaler = StandardScaler()
scaler.fit(X)

# Transform features
X_scaled = scaler.transform(X.values)

# View first instance
print(X_scaled[0])

# Split data into train and test
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled,
                                                                  y,
                                                             train_size=.7,
                                                           random_state=25)

# Check the splits are correct
print(f"Train size: {round(len(X_train_scaled) / len(X) * 100)}% \n\
Test size: {round(len(X_test_scaled) / len(X) * 100)}%")

# Instnatiating the models 
logistic_regression = LogisticRegression()
svm = SVC()
tree = DecisionTreeClassifier()

# Training the models 
logistic_regression.fit(X_train_scaled, y_train)
svm.fit(X_train_scaled, y_train)
tree.fit(X_train_scaled, y_train)

# Making predictions with each model
log_reg_preds = logistic_regression.predict(X_test_scaled)
svm_preds = svm.predict(X_test_scaled)
tree_preds = tree.predict(X_test_scaled)

# Store model predictions in a dictionary
# this makes it's easier to iterate through each model
# and print the results. 
model_preds = {
    "Logistic Regression": log_reg_preds,
    "Support Vector Machine": svm_preds,
    "Decision Tree": tree_preds
}

for model, preds in model_preds.items():
    print(f"{model} Results:\n{classification_report(y_test, preds)}", sep="\n\n")

[-3.00543351 -0.27391817  1.13343368 -0.62280558 -0.09853293 -0.09853293
 -0.36539703 -0.16222142 -0.1723455   0.        ]
Train size: 69% 
Test size: 31%
Logistic Regression Results:
              precision    recall  f1-score   support

        1080       0.00      0.00      0.00       0.0
       11320       0.00      0.00      0.00       1.0
       12290       0.00      0.00      0.00       0.0
       12317       0.00      0.00      0.00       0.0
       12430       0.00      0.00      0.00       0.0
        1263       0.00      0.00      0.00       1.0
       14024       0.00      0.00      0.00       1.0
       14062       0.00      0.00      0.00       1.0
       14344       0.00      0.00      0.00       1.0
       15236       0.00      0.00      0.00       1.0
       15519       0.00      0.00      0.00       1.0
       15736       0.00      0.00      0.00       1.0
       15746       0.00      0.00      0.00       1.0
       15796       0.00      0.00      0.00       1.0
     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
