<a href="https://kritikseth.github.io/redirect" target="_parent"><img src="https://raw.githack.com/kritikseth/kritikseth/master/redirect.svg" alt="Kritik Seth"/></a>

In [None]:
import pandas as pd
import numpy as np
from pandasql import sqldf

import math
from scipy import stats
from scipy.stats import geom
from sklearn.linear_model import LinearRegression
from statsmodels.stats.power import TTestIndPower, ttest_power

import re
from tqdm.notebook import tqdm

import seaborn as sns
from plotly import tools
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as py
import matplotlib.pyplot as plt
%matplotlib inline

import gower
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from category_encoders import TargetEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, roc_auc_score, classification_report, mean_absolute_error
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from kmodes.kprototypes import KPrototypes
from sklearn.cluster import KMeans, DBSCAN
from mpl_toolkits.mplot3d import Axes3D
from yellowbrick.classifier import ROCAUC

import warnings
# Silence every warning for the rest of the notebook. The second, category-specific
# filter is already covered by the blanket one above it.
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category = DeprecationWarning)

# Single seed passed as random_state to the splits, forests and searches below.
R_STATE = 18714836 # random state

In [None]:
# Colab-only: mount Google Drive so the /content/gdrive/... paths read below resolve.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

## Data Loading and Cleaning

In [None]:
# Raw benchmark table; every cleaning step below mutates this frame in place.
df = pd.read_csv('/content/gdrive/MyDrive/Capstone/fps_videogames.csv')

In [None]:
# Map the raw CamelCase column names to the spaced, human-readable names used
# throughout the rest of the notebook.
# NOTE(review): the key for 'GPU Bus' starts with a literal apostrophe ("'GpuBus").
# Presumably the CSV header really contains it - confirm against the file, because
# rename() silently ignores keys that do not match any column.
rename = {'CpuName': 'CPU Name', 'CpuNumberOfCores': 'CPU Cores', 'CpuNumberOfThreads': 'CPU Threads',
          'CpuBaseClock': 'CPU Base Clock', 'CpuCacheL1': 'CPU Cache L1', 'CpuCacheL2': 'CPU Cache L2',
          'CpuCacheL3': 'CPU Cache L3', 'CpuDieSize': 'CPU Die Size', 'CpuFrequency': 'CPU Frequency',
          'CpuMultiplier': 'CPU Multiplier', 'CpuMultiplierUnlocked': 'CPU Multiplier Unlocked',
          'CpuProcessSize': 'CPU Process Size', 'CpuTDP': 'CPU TDP', 'CpuNumberOfTransistors': 'CPU Transistors',
          'CpuTurboClock': 'CPU Turbo Clock',
          'GpuName': 'GPU Name', 'GpuArchitecture': 'GPU Architecture', 'GpuBandwidth': 'GPU Bandwidth',
          'GpuBaseClock': 'GPU Base Clock', 'GpuBoostClock': 'GPU Boost Clock', '\'GpuBus': 'GPU Bus',
          'GpuNumberOfComputeUnits': 'GPU Compute Units', 'GpuDieSize': 'GPU Die Size', 'GpuDirectX': 'GPU Direct X',
          'GpuNumberOfExecutionUnits': 'GPU Execution Units', 'GpuFP32Performance': 'GPU FP32 Performance',
          'GpuMemoryBus': 'GPU Memory Bus', 'GpuMemorySize': 'GPU Memory Size', 'GpuMemoryType': 'GPU Memory Type',
          'GpuOpenCL': 'GPU Open CL', 'GpuOpenGL': 'GPU Open GL', 'GpuPixelRate': 'GPU Pixel Rate', 'GpuProcessSize': 'GPU Process Size',
          'GpuNumberOfROPs': 'GPU Number of ROPs', 'GpuShaderModel': 'GPU Shader Model', 'GpuNumberOfShadingUnits': 'GPU Shading Units',
          'GpuNumberOfTMUs': 'GPU TMUs', 'GpuTextureRate': 'GPU Texture Rate', 'GpuNumberOfTransistors': 'GPU Transistors', 'GpuVulkan': 'GPU Vulkan',
          'GameName': 'Game', 'GameResolution': 'Game Resolution', 'GameSetting': 'Game Settings'}

df.rename(columns=rename, inplace=True)

# Normalise the separator so hyphenated and spaced spellings of a CPU name agree,
# then take the first word as the brand.
df['CPU Name'] = df['CPU Name'].apply(lambda x: x.replace('-', ' '))
df['CPU Brand'] = df['CPU Name'].apply(lambda x: x.split(' ')[0])

# 'CPU Type' is the trailing run of letters of the name (upper-cased), or 'Normal'
# when the name does not end in a letter.
cpu_suffix = df['CPU Name'].str.extract(r'([a-zA-Z]+)$', expand=False)
df['CPU Type'] = cpu_suffix.str.upper().fillna('Normal')

# Strip that suffix from the END of the name only. The previous str.replace(suffix, '')
# removed every occurrence of the suffix text, so a trailing "C" also deleted the "C"
# in "Core".
df['CPU Name'] = df['CPU Name'].str.replace(r'[a-zA-Z]+$', '', regex=True).str.strip()

# Start the model string from the cleaned CPU name and remove the brand from it.
df['CPU Model'] = df['CPU Name']
df['CPU Model'] = df.apply(lambda x : x['CPU Model'].replace(str(x['CPU Brand']), '').strip(), axis=1)

# Series labels are matched by substring. Order matters: 'Athlon 64' and 'Athlon II'
# come after 'Athlon', so the more specific label overwrites the generic one.
cpu_series = ['A4', 'A6', 'Athlon', 'Athlon 64', 'Athlon II', 'FX', 'Ryzen', 'Core', 'Pentium']

for series in cpu_series:
    # Tag rows whose model string contains this series, then copy the tag into
    # 'CPU Series' for exactly those rows (other rows keep their earlier value).
    df['CPU Series Temp'] = df['CPU Model'].apply(lambda x: series if series in x else 'NA')
    ind = df[df['CPU Series Temp']==series].index
    df.loc[ind, 'CPU Series'] = df.loc[ind, 'CPU Series Temp']

# Remove the series label from the model string. str() turns a missing series into
# the text 'nan', which is what gets searched for on rows without a series.
df['CPU Model'] = df.apply(lambda x : x['CPU Model'].replace(str(x['CPU Series']), '').strip(), axis=1)
df.drop(['CPU Series Temp'], axis=1, inplace=True)

# Generation = first character of the last model token, or its second character
# when that token starts with a letter.
# NOTE(review): an empty or single-letter last token raises IndexError here.
df['CPU Generation'] = df['CPU Model'].apply(lambda x: x.split(' ')[-1][1] if x.split(' ')[-1][0].isalpha() else x.split(' ')[-1][0]).tolist()

core_ind = df[df['CPU Series']=='Core'].index

# For 'Core' rows a three-character last token becomes first char + '0' (e.g. '9' -> '90');
# any other length keeps just the first character.
df.loc[core_ind, 'CPU Generation'] = df.loc[core_ind, 'CPU Model'].apply(lambda x: x.split(' ')[-1][0]+'0' if len(x.split(' ')[-1])==3 else x.split(' ')[-1][0])
# Translate the synthetic two-digit codes into ordinal generations (1, 0, -1, ...);
# a token that started with '(' becomes missing.
older_gen = {'90': 1, '80': 0, '70': -1, '60': -2, '50': -3, '40': -4, '(': None}
df['CPU Generation'] = df['CPU Generation'].replace(older_gen)

# Drop the last token from the model string. replace() removes every occurrence of
# that token's text, not only the trailing one.
df['CPU Model'] = df.apply(lambda x : x['CPU Model'].replace(str(x['CPU Model'].split(' ')[-1]), '').strip(), axis=1)

# Missing model / series labels become the literal string 'NA'.
for text_col in ('CPU Model', 'CPU Series'):
    df[text_col] = df[text_col].fillna('NA')

# The raw data marks unknown values with '?'; turn those into real missing values.
df.replace({'?': None}, inplace=True)

# Transistor counts and die sizes were read as text because of the '?' markers.
for numeric_col in ('GPU Transistors', 'CPU Transistors', 'GPU Die Size', 'CPU Die Size'):
    df[numeric_col] = df[numeric_col].astype(float)

In [None]:
# Column groups used to slice the main frame into CPU-only and GPU-only views.
cpu_cols = ['CPU Name', 'CPU Brand', 'CPU Model', 'CPU Series', 'CPU Generation', 'CPU Type', 'CPU Cores', 'CPU Threads',
            'CPU Base Clock', 'CPU Cache L1', 'CPU Cache L2', 'CPU Cache L3', 'CPU Die Size','CPU Frequency', 'CPU Multiplier',
            'CPU Multiplier Unlocked', 'CPU Process Size', 'CPU TDP', 'CPU Transistors', 'CPU Turbo Clock']

cpu_detail_cols = ['CPU Brand', 'CPU Model', 'CPU Series', 'CPU Generation']

gpu_cols = ['GPU Name', 'GPU Architecture', 'GPU Bandwidth', 'GPU Base Clock', 'GPU Boost Clock', 'GPU Bus', 'GPU Compute Units',
            'GPU Die Size', 'GPU Direct X', 'GPU Execution Units', 'GPU FP32 Performance', 'GPU Memory Bus', 'GPU Memory Size',
            'GPU Memory Type', 'GPU Open CL', 'GPU Open GL', 'GPU Pixel Rate', 'GPU Process Size', 'GPU Number of ROPs', 'GPU Shader Model',
            'GPU Shading Units', 'GPU TMUs', 'GPU Texture Rate', 'GPU Transistors', 'GPU Vulkan']

# NOTE(review): these GPU detail columns are only produced by the commented-out cell
# below, so they do not exist in df as the notebook stands.
gpu_detail_cols = ['GPU Brand', 'GPU Model', 'GPU Series', 'GPU Generation']


# Column subsets of the cleaned frame.
cpu = df[cpu_cols]
gpu = df[gpu_cols]

In [None]:
# gpu['GPU Name Temp'] = gpu['GPU Name']
# gpu['GPU Brand'] = gpu['GPU Name Temp'].apply(lambda x: x.split(' ')[0])
# gpu['GPU Name Temp'] = gpu['GPU Name Temp'].apply(lambda x: ' '.join(x.split(' ')[1:]))

# gpu['GPU Model'] = gpu['GPU Name Temp'].apply(lambda x: x.split(' ')[0])
# gpu['GPU Name Temp'] = gpu['GPU Name Temp'].apply(lambda x: ' '.join(x.split(' ')[1:]))

# gpu['GPU Series'] = gpu['GPU Name Temp'].apply(lambda x: x.split(' ')[0])
# gpu['GPU Name Temp'] = gpu['GPU Name Temp'].apply(lambda x: ' '.join(x.split(' ')[1:]))

# gpu['GPU Generation'] = gpu['GPU Name Temp'].apply(lambda x: x.split(' ')[0])
# gpu['GPU Name Temp'] = gpu['GPU Name Temp'].apply(lambda x: ' '.join(x.split(' ')[1:]))

In [None]:
# Columns known to contain missing values, per component.
cpu_null_cols = ['CPU Cache L3', 'CPU Die Size', 'CPU Transistors']
# 'GPU Compute Units' was listed three times; selecting with such a list yields
# duplicated columns, so each name now appears once.
gpu_null_cols = ['GPU Bandwidth', 'GPU Compute Units', 'GPU Die Size',
                 'GPU Execution Units', 'GPU FP32 Performance', 'GPU Memory Bus', 'GPU Memory Size', 'GPU Memory Type',
                 'GPU Open CL', 'GPU Shader Model', 'GPU Shading Units', 'GPU Transistors', 'GPU Vulkan']

## Exploratory Data Analysis

In [None]:
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints the total number of columns and how many of them have gaps, then
    returns a frame indexed by column name with two columns, 'Missing Values'
    (count) and '% of Total Values' (share of rows, rounded to one decimal),
    sorted by share in descending order. Columns without gaps are omitted.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    summary = pd.concat([null_counts, null_share], axis=1)
    summary = summary.rename(columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only columns that actually have gaps, worst offenders first.
    has_gaps = summary.iloc[:, 1] != 0
    summary = summary[has_gaps]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    print(f'Your selected dataframe has {str(df.shape[1])} columns.\nThere are {str(summary.shape[0])} columns that have missing values.')
    return summary

In [None]:
# Preview the cleaned frame.
df.head()

In [None]:
# Which columns still have gaps after cleaning, and how large they are.
missing_values_table(df)

In [None]:
# Complete-case subsets for the 3D scatter plots below. dropna() returns a new
# frame, which avoids mutating a slice of df in place (SettingWithCopyWarning).
gpu1 = df[['GPU Process Size', 'GPU Transistors', 'GPU Die Size']].dropna()
cpu1 = df[['CPU Process Size', 'CPU Transistors', 'CPU Die Size']].dropna()

In [None]:
# 3D view of CPU process size vs transistor count vs die size.
# Marker size scales with the transistor count, colour with the process size.
fig = go.Figure()
fig.add_trace(go.Scatter3d(
    x=cpu1['CPU Process Size'],
    y=cpu1['CPU Transistors'],
    z=cpu1['CPU Die Size'],
    mode='markers',
    marker={
        'size': cpu1['CPU Transistors'] * 0.006,
        'color': cpu1['CPU Process Size'],
        'colorscale': 'Viridis',
        'opacity': 0.8,
    },
))

fig.update_layout(
    title='(CPU) Process Size vs Die Size vs Transistors',
    autosize=False,
    width=800,
    height=800,
    scene={
        'xaxis_title': 'Process Size',
        'yaxis_title': 'Transistors',
        'zaxis_title': 'Die Size',
    },
)
fig.show()

In [None]:
# 3D view of GPU process size vs transistor count vs die size.
# Marker size scales with the transistor count, colour with the process size.
fig = go.Figure()
fig.add_trace(go.Scatter3d(
    x=gpu1['GPU Process Size'],
    y=gpu1['GPU Transistors'],
    z=gpu1['GPU Die Size'],
    mode='markers',
    marker={
        'size': gpu1['GPU Transistors'] * 0.003,
        'color': gpu1['GPU Process Size'],
        'colorscale': 'Viridis',
        'opacity': 0.8,
    },
))

fig.update_layout(
    title='(GPU) Process Size vs Die Size vs Transistors',
    autosize=False,
    width=800,
    height=800,
    scene={
        'xaxis_title': 'Process Size',
        'yaxis_title': 'Transistors',
        'zaxis_title': 'Die Size',
    },
)
# Rendered through the static 'svg' renderer, unlike the interactive CPU figure.
fig.show('svg')

In [None]:
# One row per benchmark with a unit weight, so the sunburst sector sizes are row counts.
# assign() builds the 'Count' column on a fresh frame instead of writing into a
# slice of df (which triggers SettingWithCopyWarning).
cpu2 = df[['CPU Brand', 'CPU Series', 'CPU Model', 'CPU Generation', 'CPU Type']].assign(Count=1)

fig = px.sunburst(cpu2, path=['CPU Brand', 'CPU Series', 'CPU Model'], values='Count', title='Pie Chart of CPUs in our dataset',
            color='CPU Brand', color_continuous_scale='Viridis', color_continuous_midpoint=5)
fig.show()

In [None]:
# Number of benchmark rows per game.
# assign() adds the unit 'Count' column on a new frame rather than on a slice of df.
game = df[['Game', 'Game Resolution', 'Game Settings']].assign(Count=1)

game_count = game.groupby(['Game']).agg('count').reset_index().drop_duplicates()
fig = px.bar(game_count, x='Game', y='Count')
fig.show()

In [None]:
# Number of benchmark rows per game resolution.
# assign() adds the unit 'Count' column on a new frame rather than on a slice of df.
game = df[['Game', 'Game Resolution', 'Game Settings']].assign(Count=1)

game_reso = game.groupby(['Game Resolution']).agg('count').reset_index().drop_duplicates()
fig = px.bar(game_reso, x='Game Resolution', y='Count')
fig.show()

In [None]:
# Number of benchmark rows per graphics-settings preset.
# assign() adds the unit 'Count' column on a new frame rather than on a slice of df.
game = df[['Game', 'Game Resolution', 'Game Settings']].assign(Count=1)

game_sett = game.groupby(['Game Settings']).agg('count').reset_index().drop_duplicates()
fig = px.bar(game_sett, x='Game Settings', y='Count')
fig.show()

In [None]:
# Pairwise correlation of the numeric columns.
# df still holds text columns (names, brands, ...). pandas >= 2.0 raises on those
# instead of silently skipping them, so restrict to numeric/bool columns explicitly.
corr = df.select_dtypes(include=['number', 'bool']).corr()

fig = go.Figure(data=go.Heatmap(z=corr, x=corr.columns, y=corr.columns,
                                xgap=1, ygap=1, colorscale='Viridis'),
                layout=go.Layout(title_text='Correlation Plot', height=1000,
                                 yaxis_autorange='reversed'))

fig.show()

## Inference

In [None]:
# Lookup table that is merged onto the benchmark data by 'CPU Name' in the next cell.
cpu_years = pd.read_excel('/content/gdrive/MyDrive/Capstone/cpu_names.xlsx')

In [None]:
cpu_detail_cols = ['CPU Brand', 'CPU Model', 'CPU Series', 'CPU Generation', 'CPU Cores']

# Use the same separator convention as df['CPU Name'] so the merge keys line up.
cpu_years['CPU Name'] = cpu_years['CPU Name'].apply(lambda x:x.replace('-', ' '))
# cpu_years.drop(['month'], axis=1, inplace=True)
cpu = pd.merge(df[cpu_detail_cols + ['CPU Name', 'CPU Transistors']], cpu_years, how='left', on='CPU Name')
# .copy() makes the filtered rows an independent frame, so the column assignments
# below do not write into a slice of the merge result.
cpu = cpu[~cpu['CPU Transistors'].isna() & ~cpu['CPU Generation'].isna()].copy() #dropping na transistors

# 'Unnamed: 3' comes from the spreadsheet; presumably an empty helper column - confirm.
cpu = cpu.drop(['Unnamed: 3'], axis=1)
numeric_cols = ['CPU Transistors', 'CPU Generation']
cpu[numeric_cols] = cpu[numeric_cols].apply(pd.to_numeric)

cpu = cpu.dropna().drop_duplicates()
# Transistors per core, truncated to an integer.
cpu['Adjusted Transistors'] = (cpu['CPU Transistors']/cpu['CPU Cores']).apply(int)

# sort_values() returns a new frame; the result was previously discarded, so the
# table was never actually ordered by release year.
cpu = cpu.sort_values('CPU Release Year')

# SQL-friendly copy: column names with underscores instead of spaces.
temp = cpu.copy()
new_cols = [x.replace(' ', '_') for x in cpu.columns]
print(new_cols)
temp.columns = new_cols
temp.head()

In [None]:
# Per (adjusted) release year: maximum, count and mean of the per-core transistor
# figure. Rows with month > 9 are attributed to the following year.
# NOTE(review): `month < 1` looks unreachable if month is a 1-12 calendar month -
# confirm how `month` is encoded in the spreadsheet.
out = sqldf('''

    select max(Adjusted_Transistors) as Max_transistors, count(Adjusted_Transistors) as Models_released,
    avg(Adjusted_Transistors) as Avg_transistors,
    case when month < 1 then CPU_Release_Year - 1 
    when month > 9 then CPU_Release_Year + 1
    else CPU_Release_Year
    end as new_year
    from temp group by new_year

''')

In [None]:
# Expected value under doubling: twice this row's maximum.
out['Double_Transistors'] = 2 * out['Max_transistors']
# Observed value two rows later; shift(-2) leaves the last two rows as NaN.
out['Actual_Transistors'] = out['Max_transistors'].shift(-2)

In [None]:
# Manual overrides: the value found at (row N, first column) is replaced by a
# hand-picked constant.
# NOTE(review): DataFrame.replace() substitutes that value wherever it occurs in the
# frame, in any column, not only in that one cell. The derived 'Double_Transistors'
# column computed earlier is not recalculated afterwards. Confirm both are intended,
# and where the constants 1200 / 1303 / 3100 come from.
out = out.replace(out.iloc[13][0], 1200) 
out = out.replace(out.iloc[6][0], 1303)
out = out.replace(out.iloc[7][0], 1303)
out = out.replace(out.iloc[11][0], 3100)

In [None]:
# Inspect the yearly table after the manual overrides.
out

In [None]:
# Two-sample Kolmogorov-Smirnov test, actual vs doubled series.
# [:-2] drops the two trailing rows where the shifted 'Actual' column is NaN.
stat1, p1 = stats.ks_2samp(out['Actual_Transistors'][:-2], out['Double_Transistors'][:-2])

In [None]:
# Mann-Whitney U test on the same two samples; stat2 is the U statistic used for
# the effect size in the next cell.
stat2, p2 = stats.mannwhitneyu(out['Actual_Transistors'][:-2], out['Double_Transistors'][:-2])

In [None]:
# Effect size for the Mann-Whitney U test via the normal approximation of U.
# Sample sizes are taken from the samples that were actually tested instead of
# a hard-coded 12.
n1 = len(out['Actual_Transistors'][:-2])
n2 = len(out['Double_Transistors'][:-2])
mu = (n1*n2)/2                              # mean of U under H0
sigma = math.sqrt((n1*n2)*(n1+n2+1)/12)     # standard deviation of U under H0
z = (stat2 - mu)/sigma
# r = |z| / sqrt(N) with N the TOTAL number of observations (n1 + n2). Dividing by
# sqrt(n1) alone overstates the effect size.
r = abs(z)/math.sqrt(n1 + n2)

In [None]:
# Effect size computed in the previous cell.
r

In [None]:
# The two series whose distributions are compared in the KDE plot below.
kde_df = out[['Actual_Transistors', 'Double_Transistors']]

In [None]:
# Kernel density estimate of both series on shared axes, saved to disk.
ax = kde_df.plot.kde()
ax.set_xlabel('Transistors')
ax.set_ylabel('Density')
ax.set_title('Comparison of Distribution Between Actual Transistors and Double Transistors')
ax.figure.savefig('KDE Plot Of Distributions')

In [None]:
# Bar heights for the grouped bar chart; the last two rows are dropped because
# 'Actual_Transistors' is NaN there.
one = out['Double_Transistors'][:-2]
two = out['Actual_Transistors'][:-2]

In [None]:
# Grouped bars per year: expected (doubled) vs actual transistor counts.
X = out['new_year'][:-2]
X_axis = np.arange(len(X))
plt.bar(X_axis - 0.2, one, 0.4, label = 'Double_Transistors')
plt.bar(X_axis + 0.2, two, 0.4, label = 'Actual_Transistors')
plt.xticks(X_axis, X)
plt.xlabel('Year')
plt.ylabel('Number of Transistors')
plt.title('Comparison of Distribution Between Actual Transistors and Double Transistors')
plt.xticks(rotation = 45)
plt.legend()
# Save BEFORE show(): with the inline backend show() finalises the current figure,
# so a savefig() issued afterwards writes an empty canvas.
plt.savefig('Bar Plot Of Distributions')
plt.show()

In [None]:
# Independent copy so the error column below does not alter `out`.
errorBar_df = out.copy()

In [None]:
# Absolute gap between expected (doubled) and actual counts; NaN where 'Actual' is NaN.
errorBar_df['Errors'] = abs(errorBar_df['Double_Transistors'] - errorBar_df['Actual_Transistors'])

In [None]:
# Inspect the table with the added 'Errors' column.
errorBar_df

In [None]:
# Standard error of the mean of the actual counts. Series.sem() divides the sample
# standard deviation by sqrt(number of non-missing values), replacing the
# hard-coded sqrt(12).
sem_Actual = out['Actual_Transistors'].sem()
sem_Actual

In [None]:
# Standard error of the mean of the expected (doubled) counts. The standard
# deviation is taken over every row of the column, so the divisor must be that
# same count, which Series.sem() uses. The previous hard-coded sqrt(12) matched
# the shorter 'Actual' series.
sem_Expected = out['Double_Transistors'].sem()
sem_Expected

In [None]:
# Actual transistor counts per year; every bar carries the same error bar (the SEM).
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(
    errorBar_df['new_year'],
    errorBar_df['Actual_Transistors'],
    yerr=sem_Actual,
    align='center',
    alpha=0.5,
    ecolor='black',
    capsize=10,
)
ax.set_xlabel('Year')
ax.set_ylabel('Actual Number of Transistors')

In [None]:
# Expected (doubled) transistor counts per year; every bar carries the same error
# bar (the SEM).
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(
    errorBar_df['new_year'],
    errorBar_df['Double_Transistors'],
    yerr=sem_Expected,
    align='center',
    alpha=0.5,
    ecolor='black',
    capsize=10,
)
ax.set_xlabel('Year')
ax.set_ylabel('Expected Number of Transistors')

## Data Imputation

In [None]:
# Key columns that identify a CPU family; the first three select "sibling" rows and
# the fourth (generation) is the axis along which values are inter-/extrapolated.
based_col = ['CPU Brand', 'CPU Model', 'CPU Series', 'CPU Generation']

# Which column is filled by which strategy. An empty list disables that strategy.
geometric_cols = ['CPU Transistors']
linear_cols = ['CPU Die Size']
step_cols = []

# Geometric Imputation
# For each row missing `impute_col`, look at rows sharing brand/model/series
# ("siblings"), averaged per generation:
#   * no sibling at all           -> the row is dropped from df
#   * same generation known       -> copy that mean
#   * generation between siblings -> geometric interpolation between oldest and newest
#   * generation older / newer    -> halve / double per generation step from the
#                                    oldest / newest sibling
if geometric_cols:
    for impute_col in geometric_cols:
        temp = df[based_col + [impute_col]].dropna(inplace=False)
        temp[impute_col] = pd.to_numeric(temp[impute_col])
        index = df[df[impute_col].isnull()].index.tolist()

        # Rows of [brand, model, series, generation, mean value].
        value = temp.groupby(based_col).agg('mean').reset_index().to_dict('split')['data']
        value_keys = [val[:4] for val in value]
        for ind in tqdm(index):

            key = tuple(df.loc[ind, based_col].values)
            previous = [val for val in value if key[:3] == tuple(val[:3])]
            if not previous:
                df.drop(ind, axis=0, inplace=True)
                continue
            gen = [int(p[3]) for p in previous]
            impute = [int(p[-1]) for p in previous]

            if list(key) in value_keys:
                df.loc[ind, impute_col] = [val for val in value if key[:4] == tuple(val[:4])][0][-1]

            else:
                target_gen = int(key[3])
                lo_gen, hi_gen = min(gen), max(gen)
                lo_val = impute[gen.index(lo_gen)]
                hi_val = impute[gen.index(hi_gen)]

                # The branches are mutually exclusive. Previously the second test was a
                # separate `if`, so its `else` also ran for the in-between case and
                # overwrote the interpolated value.
                if lo_gen < target_gen < hi_gen:
                    # Named `curve` rather than `geom` so the scipy.stats import of
                    # that name is not shadowed.
                    curve = np.geomspace(lo_val, hi_val, hi_gen - lo_gen + 1)
                    df.loc[ind, impute_col] = curve[target_gen - lo_gen]

                elif target_gen < lo_gen:
                    # Older than every sibling: halve once per missing generation.
                    df.loc[ind, impute_col] = lo_val / 2 ** (lo_gen - target_gen)

                elif target_gen > hi_gen:
                    # Newer than every sibling: double from the NEWEST sibling
                    # (the old code started from the oldest one).
                    df.loc[ind, impute_col] = hi_val * 2 ** (target_gen - hi_gen)

                else:
                    # Same generation as a boundary sibling but not matched exactly
                    # above (e.g. key dtype mismatch): reuse that sibling's value.
                    df.loc[ind, impute_col] = impute[gen.index(target_gen)]


# Linear Imputation
# Same sibling lookup as the geometric strategy, but unknown generations are filled
# from a straight line fitted to (generation, mean value) of the siblings.
if linear_cols:
    for impute_col in linear_cols:
        temp = df[based_col + [impute_col]].dropna(inplace=False)
        temp[impute_col] = pd.to_numeric(temp[impute_col])
        index = df[df[impute_col].isnull()].index.tolist()

        # Rows of [brand, model, series, generation, mean value].
        value = temp.groupby(based_col).agg('mean').reset_index().to_dict('split')['data']
        value_keys = [val[:4] for val in value]
        for ind in tqdm(index):

            key = tuple(df.loc[ind, based_col].values)
            previous = [val for val in value if key[:3] == tuple(val[:3])]
            if not previous:
                # No sibling to learn from: the row is removed.
                df.drop(ind, axis=0, inplace=True)
                continue
            gen = [int(p[3]) for p in previous]
            impute = [int(p[-1]) for p in previous]

            if list(key) in value_keys:
                df.loc[ind, impute_col] = [val for val in value if key[:4] == tuple(val[:4])][0][-1]

            else:
                try:
                    target_gen = int(key[3])
                except (TypeError, ValueError):
                    # Generation unknown: there is no x to predict at, so fall back
                    # to the mean of the siblings.
                    df.loc[ind, impute_col] = float(np.mean(impute))
                    continue

                lr = LinearRegression()
                lr.fit(np.array(gen).reshape(-1, 1), np.array(impute).reshape(-1, 1))
                # Predict at THIS row's generation; the old code always predicted at
                # a hard-coded generation 7.
                df.loc[ind, impute_col] = lr.predict([[target_gen]])[0][0]


# Step Imputation
# Same sibling lookup as the other strategies; values change by a constant step per
# generation, the step being the slope between the oldest and newest sibling.
if step_cols:
    for impute_col in step_cols:
        temp = df[based_col + [impute_col]].dropna(inplace=False)
        temp[impute_col] = pd.to_numeric(temp[impute_col])
        index = df[df[impute_col].isnull()].index.tolist()

        # Rows of [brand, model, series, generation, mean value].
        value = temp.groupby(based_col).agg('mean').reset_index().to_dict('split')['data']
        value_keys = [val[:4] for val in value]
        for ind in tqdm(index):

            key = tuple(df.loc[ind, based_col].values)
            previous = [val for val in value if key[:3] == tuple(val[:3])]
            if not previous:
                # No sibling to learn from: the row is removed.
                df.drop(ind, axis=0, inplace=True)
                continue
            gen = [int(p[3]) for p in previous]
            impute = [int(p[-1]) for p in previous]

            if list(key) in value_keys:
                df.loc[ind, impute_col] = [val for val in value if key[:4] == tuple(val[:4])][0][-1]

            else:
                target_gen = int(key[3])
                lo_gen, hi_gen = min(gen), max(gen)
                lo_val = impute[gen.index(lo_gen)]
                hi_val = impute[gen.index(hi_gen)]

                steps = np.linspace(lo_val, hi_val, hi_gen - lo_gen + 1)
                # Was `step = step[1] - step[0]`, a NameError. With a single known
                # generation there is no slope to estimate, so the value is carried.
                step = steps[1] - steps[0] if len(steps) > 1 else 0

                # Mutually exclusive branches (the old `if/if/else` let the final
                # else overwrite the interpolated value).
                if lo_gen < target_gen < hi_gen:
                    df.loc[ind, impute_col] = steps[target_gen - lo_gen]

                elif target_gen < lo_gen:
                    # Older than every sibling: step down from the oldest one.
                    df.loc[ind, impute_col] = lo_val - step * (lo_gen - target_gen)

                elif target_gen > hi_gen:
                    # Newer than every sibling: step up from the NEWEST one
                    # (the old code started from the oldest).
                    df.loc[ind, impute_col] = hi_val + step * (target_gen - hi_gen)

                else:
                    # Same generation as a boundary sibling but not matched exactly
                    # above: reuse that sibling's value.
                    df.loc[ind, impute_col] = impute[gen.index(target_gen)]

In [None]:
 missing_values_table(df)  # re-check the gaps after imputation; NOTE(review): this line has a stray leading space

## Regression

In [None]:
# Replaces the in-memory frame with a transformed dataset read from disk; nothing
# computed on the previous `df` carries over from here on.
df = pd.read_csv('/content/gdrive/MyDrive/Capstone/transformed_fps.csv')

In [None]:
# Identifier-like columns removed before modelling.
drop_cols = ['id', 'CPU Name', 'GPU Name','GPU Open GL','CPU Model','Dataset']
# Categorical columns that the mean-encoding cell below converts to numbers.
label_cols = ['CPU Brand', 'CPU Series',
              'CPU Type', 'CPU Multiplier Unlocked',
              'GPU Architecture', 'GPU Bus', 'GPU Memory Type',
              'GPU Open CL','GPU Shader Model', 'GPU Vulkan', 'Game', 'GPU Direct X', 'Game Settings']

# NOTE(review): not referenced anywhere in the visible part of the notebook.
auto_ordinal_cols = ['Game Resolution', 'CPU Base Clock'] 
df = df.drop(drop_cols, axis=1)

### Mean Encoding

In [None]:
# Smoothed mean (target) encoding: each category is replaced by a blend of its own
# mean FPS and the global mean FPS, weighted by how many rows the category has.
# NOTE(review): the encoding uses the full dataset before any train/test split, so
# test-set targets leak into the features - consider fitting it on the training
# rows only.
mean = df['FPS'].mean()   # global mean; does not change inside the loop
weight = 100              # pseudo-count given to the global mean
for label in label_cols:
  agg = df.groupby(label)['FPS'].agg(['count', 'mean'])
  counts = agg['count']
  means = agg['mean']
  smooth = (counts*means+weight*mean)/(counts+weight)
  # Plain column assignment replaces the column and its dtype. `df.loc[:, label] = ...`
  # writes in place on pandas >= 2.0 and leaves the column as object dtype.
  df[label] = df[label].map(smooth)

In [None]:
# Rows x columns after dropping identifiers and encoding the categoricals.
df.shape

### Random Forest

In [None]:
# Baseline random forest on every feature.
# Only the non-default hyper-parameters are spelled out. The explicit
# criterion='mse' and max_features='auto' were removed from scikit-learn and now
# raise; leaving them at their defaults (squared error, all features) keeps the
# same model on old and new releases alike.
rfr = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=R_STATE)
df1=df.drop(['FPS'],axis=1)
X = df1
y = df['FPS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=R_STATE)
rfr.fit(X_train, y_train)
prediction = rfr.predict(X_test)
mse = mean_squared_error(y_test, prediction)

print(f'MSE is {mse}')
print(f'RMSE is {mse ** 0.5}')
print(f'R2 score is {r2_score(y_test, prediction)}')

### Scatter Plot

In [None]:
# plt.scatter(prediction, prediction - y_test, c='green', marker='s', label='Test data')
# plt.xlabel('Predicted values')
# plt.ylabel('Residuals')
# plt.legend(loc='upper left')
# plt.hlines(y=0, xmin=0, xmax=1000, lw=2, color='red')
# plt.xlim([0, 1000])
# plt.tight_layout()
# plt.show()

### Feature Importances

In [None]:
def plot_feature_importance(importance, names, model_type, top_n=20):
    """Draw a horizontal bar chart of the most important features.

    Parameters
    ----------
    importance : array-like
        Importance score per feature (e.g. ``model.feature_importances_``).
    names : array-like
        Feature names, in the same order as ``importance``.
    model_type : str
        Label prepended to the plot title.
    top_n : int, default 20
        Number of highest-scoring features to show.
    """
    fi_df = pd.DataFrame({'feature_names': np.array(names),
                          'feature_importance': np.array(importance)})
    # Select the top rows once so x and y have the same length. Previously only y
    # was truncated and the plot relied on index alignment to drop the rest.
    fi_df = fi_df.sort_values(by=['feature_importance'], ascending=False).head(top_n)

    plt.figure(figsize=(10,8))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

plot_feature_importance(rfr.feature_importances_,X_train.columns,'RANDOM FOREST')

In [None]:
# Order rows by 'Game Settings' so each setting forms one contiguous run; the group
# sizes shown here are what the positional slices in the next cell rely on.
# 'Game Settings' is in label_cols, so by now it holds mean-encoded numbers and the
# sort is by encoded value.
df = df.sort_values('Game Settings')
df['Game Settings'].value_counts()

In [None]:
# Positional slices of the sorted frame, one per graphics setting.
# NOTE(review): the boundaries are hard-coded row counts, presumably read off the
# value_counts() output above. They silently mislabel rows if the data or the sort
# order changes - confirm which encoded value each slice really corresponds to.
df_low = df[:64573]
df_med = df[64573:110186]
df_high = df[110186:314478]
df_max = df[314478:]

In [None]:
# Fresh forest for the per-setting importance plots.
# Only non-default hyper-parameters are passed. criterion='mse' and
# max_features='auto' were removed from scikit-learn; their defaults (squared error,
# all features) give the same model on old and new releases.
rfr = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=R_STATE)

df1 = df_low.drop(['FPS'],axis=1)
X = df1
y = df_low['FPS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=R_STATE)
rfr.fit(X_train, y_train)
plot_feature_importance(rfr.feature_importances_,X_train.columns,'RANDOM FOREST low game settings')

In [None]:
# Refit the same forest on the medium-settings slice and plot its importances.
X = df1 = df_med.drop(columns=['FPS'])
y = df_med['FPS']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=R_STATE
)
rfr.fit(X_train, y_train)
plot_feature_importance(rfr.feature_importances_, X_train.columns, 'RANDOM FOREST med game settings')

In [None]:
# Refit the same forest on the high-settings slice and plot its importances.
X = df1 = df_high.drop(columns=['FPS'])
y = df_high['FPS']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=R_STATE
)
rfr.fit(X_train, y_train)
plot_feature_importance(rfr.feature_importances_, X_train.columns, 'RANDOM FOREST high game settings')

In [None]:
# Refit the same forest on the max-settings slice and plot its importances.
X = df1 = df_max.drop(columns=['FPS'])
y = df_max['FPS']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=R_STATE
)
rfr.fit(X_train, y_train)
plot_feature_importance(rfr.feature_importances_, X_train.columns, 'RANDOM FOREST max game settings')

### PCA

In [None]:
# Feature matrix without the target, split 70/30, then standardised.
df_ = df.drop(columns=['FPS'])
X_train, X_test, y_train, y_test = train_test_split(
    df_, df['FPS'], test_size=0.30, random_state=R_STATE
)

# The scaler learns mean/variance from the training rows only and reuses them on
# the test rows.
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

# Targets as plain arrays for the estimators below.
y_train, y_test = np.array(y_train), np.array(y_test)

In [None]:
# Scree analysis: cumulative explained variance per number of components.
# NOTE(review): n_components=42 is hard-coded; PCA raises if the feature matrix has
# fewer columns than that - confirm it matches the current number of features.
pca_test = PCA(n_components=42)
pca_test.fit(X_train_scaled)
sns.set(style='whitegrid')
plt.plot(np.cumsum(pca_test.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
# Red dashed line marks the 15 components kept in the next cell.
plt.axvline(linewidth=4, color='r', linestyle = '--', x=15, ymin=0, ymax=1)
# plt.show() returns None; wrapping it in display() only added a stray "None" output.
plt.show()
evr = pca_test.explained_variance_ratio_
cvr = np.cumsum(pca_test.explained_variance_ratio_)
pca_df = pd.DataFrame()
pca_df['Cumulative Variance Ratio'] = cvr
pca_df['Explained Variance Ratio'] = evr
display(pca_df.head(10))

In [None]:
# Project onto the first 15 principal components: fitted on the training rows,
# then applied to both splits.
pca = PCA(n_components=15).fit(X_train_scaled)
X_train_scaled_pca, X_test_scaled_pca = (
    pca.transform(split) for split in (X_train_scaled, X_test_scaled)
)

In [None]:
# Loadings table: one row per component of the scree PCA, one column per original
# feature; shown transposed for the first 15 components.
pca_dims = [f'PCA Component {i}' for i in range(len(pca_df))]
pca_test_df = pd.DataFrame(pca_test.components_, columns=df_.columns, index=pca_dims)
pca_test_df.head(15).T

In [None]:
# Refit the existing forest object (last trained on the max-settings slice) on the
# 15 PCA components and score it on the held-out rows.
rfr.fit(X_train_scaled_pca, y_train)
prediction = rfr.predict(X_test_scaled_pca)

mse = mean_squared_error(y_test, prediction)
print(f'MSE is {mse}')
print(f'RMSE is {mse**0.5}')
print(f'R2 score is {r2_score(y_test, prediction)}')

In [None]:
# Halve the training set; only the "small" half is used for the hyper-parameter
# search below.
X_pca_big, X_pca_small, y_big, y_small = train_test_split(X_train_scaled_pca, y_train, test_size = 0.5, random_state=R_STATE)

In [None]:
# Candidate values for each random-forest hyper-parameter.
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
max_features = ['log2', 'sqrt']
max_depth = [int(x) for x in np.linspace(start = 1, stop = 15, num = 15)]
min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 50, num = 10)]
min_samples_leaf = [int(x) for x in np.linspace(start = 2, stop = 50, num = 10)]
bootstrap = [True, False]
param_dist = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
# 10 random combinations, 3-fold cross-validation each, seeded for reproducibility.
# `rfr` is the estimator whose parameters are varied.
rs = RandomizedSearchCV(rfr, 
                        param_dist, 
                        n_iter = 10, 
                        cv = 3, 
                        verbose = 1, 
                        n_jobs=-1, 
                        random_state=R_STATE)
rs.fit(X_pca_small, y_small)
rs.best_params_
# Result recorded from an earlier run:
# {'n_estimators': 300,
# 'min_samples_split': 12,
# 'min_samples_leaf': 23,
# 'max_features': 'log2',
# 'max_depth': 14,
# 'bootstrap': False}

In [None]:
# Search results ranked best-first, reduced to the parameter columns and the mean
# test score.
timing_and_split_cols = [
    'mean_fit_time',
    'std_fit_time',
    'mean_score_time',
    'std_score_time',
    'params',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
    'std_test_score',
]
rs_df = pd.DataFrame(rs.cv_results_)
rs_df = rs_df.sort_values('rank_test_score').reset_index(drop=True)
rs_df = rs_df.drop(columns=timing_and_split_cols)
rs_df.head(10)

In [None]:
# Mean CV score per tried value, one panel per hyper-parameter (2 x 3 grid).
fig, axs = plt.subplots(ncols=3, nrows=2)
sns.set(style="whitegrid", color_codes=True, font_scale = 2)
fig.set_size_inches(30,25)

# (hyper-parameter, bar colour) in row-major panel order.
panels = [
    ('n_estimators', 'lightgrey'),
    ('min_samples_split', 'coral'),
    ('min_samples_leaf', 'lightgreen'),
    ('max_features', 'wheat'),
    ('max_depth', 'lightpink'),
    ('bootstrap', 'skyblue'),
]
for panel_ax, (param, colour) in zip(axs.flat, panels):
    sns.barplot(x='param_' + param, y='mean_test_score', data=rs_df, ax=panel_ax, color=colour)
    # Use panel_ax.set_ylim([...]) here to zoom into the score range if needed.
    panel_ax.set_title(label = param, size=30, weight='bold')
plt.show()

In [None]:
# Baseline forest refitted on the PCA features.
# Only non-default hyper-parameters are passed. criterion='mse' and
# max_features='auto' were removed from scikit-learn; their defaults (squared error,
# all features) give the same model on old and new releases.
rfr = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=R_STATE)
rfr.fit(X_train_scaled_pca, y_train)
# y_pred = rfr.predict(X_test_scaled_pca)
# y_pred_pca = rfr.predict(X_test_scaled_pca)
# Predictions come from the best estimator of the randomized search, not from the
# forest fitted just above.
y_pred_gs = rs.best_estimator_.predict(X_test_scaled_pca)

In [None]:
# Hold-out metrics for the tuned (randomized-search) forest.
mse = mean_squared_error(y_test, y_pred_gs)
print(f'MSE is {mse}')
print(f'RMSE is {mse**0.5}')
print(f'R2 score is {r2_score(y_test, y_pred_gs)}')

### ANN

In [None]:
# The features must not contain the target. `X = df` kept the 'FPS' column among
# the inputs, so the network could read the answer directly (target leakage) and
# its scores were meaningless.
X = df.drop(['FPS'], axis=1)
y = df['FPS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=R_STATE) #20 epochs and 32 batch_size gives the best results

# Fully connected regression network: 128 -> 256 -> 256 -> 256 -> 1.
NN_model = Sequential()

# The Input Layer :
# Width is taken from the training matrix so it always matches the features fed to fit().
NN_model.add(Dense(128, kernel_initializer='normal',input_dim = X_train.shape[1], activation='relu'))

# The Hidden Layers :
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))

# The Output Layer :
NN_model.add(Dense(1, kernel_initializer='normal',activation='linear'))

# Compile the network :
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
NN_model.summary()

In [None]:
# Write a weights file whenever the validation loss improves; the file name embeds
# the epoch number and the loss value.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

In [None]:
# Train for 50 epochs; 20% of the training rows are held out for validation.
history = NN_model.fit(X_train, y_train, epochs=50, batch_size=64, validation_split = 0.2, callbacks=callbacks_list)

In [None]:
def plot_history(history, key):
    """Plot a training metric and its validation counterpart against the epoch index.

    Parameters
    ----------
    history : object
        Must expose a ``history`` mapping from metric name to a per-epoch sequence.
    key : str
        Metric to plot; the series stored under ``'val_' + key`` is drawn alongside it.
    """
    series_names = [key, 'val_' + key]
    for series in series_names:
        plt.plot(history.history[series])
    plt.xlabel("Epochs")
    plt.ylabel(key)
    plt.legend(series_names)
    plt.show()
# Plot training vs. validation MAE per epoch; the key is the metric name given to compile().
plot_history(history, 'mean_absolute_error')

In [None]:
# Predict the target for the held-out test split with the trained network.
predictions = NN_model.predict(X_test)

In [None]:
# Test-set mean absolute error and mean squared error of the network's predictions.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

In [None]:
# First line printed: MAE. Second line: RMSE (square root of the MSE computed above).
print(mae)
print(mse**0.5)

In [None]:
# R^2 of the network on the test split (shown as the cell output).
r2_score(y_test, predictions)

## Clustering

In [None]:
# Interactive 3D scatter (plotly) of the first three columns of `mark_one`,
# coloured by the cluster label k-prototypes assigned to each point.
Scene = {
    'xaxis': {'title': 'Resolution'},
    'yaxis': {'title': 'FPS'},
    'zaxis': {'title': 'Settings'},
}
labels = kproto.labels_
trace = go.Scatter3d(
    x=mark_one[:, 0],
    y=mark_one[:, 1],
    z=mark_one[:, 2],
    mode='markers',
    marker={
        'color': labels,
        'size': 10,
        'line': {'color': 'black', 'width': 10},
    },
)
layout = go.Layout(margin={'l': 0, 'r': 0}, scene=Scene, height=800, width=800)
data = [trace]
fig = go.Figure(data=data, layout=layout)
fig.show()

#### DBSCAN

In [None]:
# Set aside a 10% sample of the data and separate the three target-like columns
# (resolution, FPS, settings) from the remaining feature columns.
big_part, small_part = train_test_split(full_data, random_state=R_STATE, test_size=0.1)
target_db = small_part.loc[:, ['Game Resolution', 'FPS', 'Game Settings']]
features_db = small_part.drop(columns=['Game Resolution', 'FPS', 'Game Settings'])

In [None]:
# Cluster the data window by window: a Gower distance matrix is built for
# `window_size` rows at a time and DBSCAN is run on each window separately.
# NOTE(review): DBSCAN receives the distance matrix as an ordinary feature table (no
# metric='precomputed'), so it compares rows of the matrix rather than using the Gower
# distances directly - confirm this is intended; eps would need retuning if it changes.
# NOTE(review): each window is clustered independently, so the same label number in two
# windows does not necessarily denote the same cluster.
lis = []
window_size = 10000
# Number of complete windows; trailing rows that do not fill a window are not clustered.
n_windows = len(full_data)//window_size
for i in range(n_windows):
  print(i)
  db = full_data[i*window_size:(i+1)*window_size]
  print(db.shape)
  features_db = db.drop(['Game Resolution', 'FPS', 'Game Settings'], axis = 1)
  target_db = db[['Game Resolution', 'FPS', 'Game Settings']]
  features_db_dist_matrix = gower.gower_matrix(features_db)
  a = pd.DataFrame(features_db_dist_matrix)
  dbs = DBSCAN(eps = 0.8, min_samples = 10).fit(a)
  b = dbs.labels_
  lis += list(b)
  # Report progress against the real window count instead of a hard-coded total.
  print(f'{i+1} out of {n_windows} done')

In [None]:
# Keep exactly the rows that received a cluster label. The row count is taken from
# `lis` rather than hard-coded, so the column assignment cannot fail on a length mismatch
# when the dataset size or window size changes.
full_data = full_data.head(len(lis)).copy()
full_data['clusters'] = lis

In [None]:
# db_data = full_data.to_csv('/content/gdrive/MyDrive/Capstone/full_data.csv')

In [None]:
# Split the labelled frame into the three plotted columns and everything else.
target_db = full_data.loc[:, ['Game Resolution', 'FPS', 'Game Settings']]
features_db = full_data.drop(columns=['Game Resolution', 'FPS', 'Game Settings'])

In [None]:
# Plain array of the three plotted columns (resolution, FPS, settings), in that order.
mark_two = target_db.to_numpy()

In [None]:
# Interactive 3D scatter (plotly) of resolution / FPS / settings,
# coloured by the DBSCAN label stored in the 'clusters' column.
Scene = {
    'xaxis': {'title': 'Resolution'},
    'yaxis': {'title': 'FPS'},
    'zaxis': {'title': 'Settings'},
}
labels = full_data['clusters']
trace = go.Scatter3d(
    x=mark_two[:, 0],
    y=mark_two[:, 1],
    z=mark_two[:, 2],
    mode='markers',
    marker={
        'color': labels,
        'size': 10,
        'line': {'color': 'black', 'width': 10},
    },
)
layout = go.Layout(margin={'l': 0, 'r': 0}, scene=Scene, height=800, width=800)
data = [trace]
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# un = []
# eps_values = np.abs(np.linspace(0,1,100))
# min_values = np.linspace(1,21,20).astype(int)
# for i,eps in enumerate(eps_values[1:-1]):
#   print(f'{i+1}')
#   for min in min_values:
#     dbs = DBSCAN(eps = eps, min_samples = min)
#     dbs.fit(a)
#     b = dbs.labels_
#     unique, counts = np.unique(b, return_counts=True)
#     if len(unique)<8 and len(unique)>4:
#       un.append([eps,min,counts])

## Classification

In [None]:
# Load the dataset used for classification; this rebinds `df`, replacing the frame used above.
# NOTE(review): this is a relative path, unlike the mounted-Drive path used for the raw
# CSV at the top of the notebook - confirm the file is present in the working directory.
df = pd.read_csv('transformed_fps.csv')

In [None]:
df.shape

In [None]:
# Columns removed before modelling.
drop_cols = [
    'id', 'CPU Name', 'GPU Name',
    'GPU Open GL',  # GPU Open GL has only one value
    'CPU Model', 'Dataset',
]

# Nominal columns, one-hot encoded in the next cell.
label_cols = [
    'CPU Brand', 'CPU Series', 'CPU Type', 'CPU Multiplier Unlocked',
    'GPU Architecture', 'GPU Bus', 'GPU Memory Type',
    'GPU Open CL', 'GPU Shader Model', 'GPU Vulkan', 'Game',
]

# Columns encoded as ordered integers further down. Questionable: CPU Base Clock,
auto_ordinal_cols = ['Game Resolution', 'CPU Base Clock', 'GPU Direct X']

#df['CPU Model'] = df['CPU Model'].replace({'X2': 0, '3': 1, 'i3': 1, '5': 2, 'i5': 2, '7': 3, 'i7': 3, '9': 4, 'Threadripper': 5})

df = df.drop(columns=drop_cols)



In [None]:
# One-hot encode the nominal columns listed in `label_cols`.
df = pd.get_dummies(df, columns=label_cols)

In [None]:
df

In [None]:
# Ordinal encodings: each listed raw value becomes its rank.
# Series.map turns any value missing from a dictionary into NaN, so re-running this
# cell on already-encoded data would blank these columns.
enc_dict_resolution = {
    720: 0,
    1080: 1,
    1440: 2,
}
enc_dict_CPU_base_clock = {
    100: 0,
    133: 1,
    200: 2,
}
enc_dict_direct = {
    '12': 0,
    '12 Ultimate': 1,
}

df['Game Resolution'] = df['Game Resolution'].map(enc_dict_resolution)
df['CPU Base Clock'] = df['CPU Base Clock'].map(enc_dict_CPU_base_clock)
# Cast to str first so the values compare equal to the string keys above.
df['GPU Direct X'] = df['GPU Direct X'].astype(str).map(enc_dict_direct)

In [None]:
def plot_ROC_curve(model, xtrain, ytrain, xtest, ytest):
    """Draw ROC curves for ``model`` with a ROCAUC visualizer and return the visualizer.

    The visualizer is fitted on ``(xtrain, ytrain)``, scored on ``(xtest, ytest)``,
    and then shown.
    """
    roc_viz = ROCAUC(model)

    # Fit on the training data, then score against the held-out data.
    roc_viz.fit(xtrain, ytrain)
    roc_viz.score(xtest, ytest)
    roc_viz.show()

    return roc_viz

def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):
    """Compute a one-vs-rest ROC AUC for every class present in ``actual_class``.

    Parameters
    ----------
    actual_class : iterable
        True labels (must be re-iterable, e.g. a list, array or Series).
    pred_class : iterable
        Predicted labels, aligned with ``actual_class``.
    average : str, default "macro"
        Forwarded unchanged to ``roc_auc_score``.

    Returns
    -------
    dict
        Maps each distinct true label to the ROC AUC obtained when that label is
        treated as the positive class and every other label as negative.
    """
    roc_auc_dict = {}
    for per_class in set(actual_class):
        # Binarise by equality with the current class. A membership test against the
        # "other" true classes would flag a predicted label that never occurs in
        # actual_class as positive for every class.
        new_actual_class = [1 if x == per_class else 0 for x in actual_class]
        new_pred_class = [1 if x == per_class else 0 for x in pred_class]

        roc_auc_dict[per_class] = roc_auc_score(new_actual_class, new_pred_class, average = average)

    return roc_auc_dict

In [None]:
#full dataset
# Random forest on every encoded feature, evaluated on a 30% hold-out split.
# max_features='sqrt' is what the removed 'auto' alias meant for classifiers, so the
# model is unchanged while remaining valid on current scikit-learn releases.
rfc = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=R_STATE, verbose=0,
                      warm_start=False)
df1=df.drop("Game Settings",axis=1)
X = df1
y = df['Game Settings']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=R_STATE)
rfc.fit(X_train, y_train)

y_pred=rfc.predict(X_test)
# NOTE(review): classification_report pairs target_names with the sorted label values,
# so this order assumes 'Game Settings' sorts as low < med < high < max - confirm the encoding.
target_names = ['low', 'med', 'high','max']
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
# ROC curves for the full-feature random forest (train split passed for fitting, test split for scoring).
plot_ROC_curve(rfc, X_train, y_train, X_test, y_test)

In [None]:
#with 25 components
# Random forest on the first 25 principal components, evaluated on a 30% hold-out split.
# max_features='sqrt' is what the removed 'auto' alias meant for classifiers.
rfc = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=R_STATE, verbose=0,
                      warm_start=False)
df1=df.drop("Game Settings",axis=1)
X = df1
y = df['Game Settings']
# PCA is seeded as well: its solver may be randomized, and every other stochastic step
# in this notebook is pinned to R_STATE.
# NOTE(review): the features are not standardised before PCA in this pipeline - confirm intended.
steps = [('pca', PCA(n_components=25, random_state=R_STATE)), ('m', rfc)]
model = Pipeline(steps=steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=R_STATE)
model.fit(X_train, y_train)

y_pred=model.predict(X_test)
# NOTE(review): classification_report pairs target_names with the sorted label values,
# so this order assumes 'Game Settings' sorts as low < med < high < max - confirm the encoding.
target_names = ['low', 'med', 'high','max']
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
# ROC curves for the pipeline bound to `model` in the previous cell (train split for fitting, test split for scoring).
plot_ROC_curve(model, X_train, y_train, X_test, y_test)

In [None]:
#with 30 components
# Random forest on the first 30 principal components, evaluated on a 30% hold-out split.
# max_features='sqrt' is what the removed 'auto' alias meant for classifiers.
rfc = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=R_STATE, verbose=0,
                      warm_start=False)
df1=df.drop("Game Settings",axis=1)
X = df1
y = df['Game Settings']
# PCA is seeded as well: its solver may be randomized, and every other stochastic step
# in this notebook is pinned to R_STATE.
# NOTE(review): the features are not standardised before PCA in this pipeline - confirm intended.
steps = [('pca', PCA(n_components=30, random_state=R_STATE)), ('m', rfc)]
model = Pipeline(steps=steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=R_STATE)
model.fit(X_train, y_train)

y_pred=model.predict(X_test)
# NOTE(review): classification_report pairs target_names with the sorted label values,
# so this order assumes 'Game Settings' sorts as low < med < high < max - confirm the encoding.
target_names = ['low', 'med', 'high','max']
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
# ROC curves for the pipeline bound to `model` in the previous cell (train split for fitting, test split for scoring).
plot_ROC_curve(model, X_train, y_train, X_test, y_test)

In [None]:
#lightgbm full dataset
# Gradient-boosted trees on every encoded feature, evaluated on a 30% hold-out split.
df1 = df.drop(columns=["Game Settings"])
X, y = df1, df['Game Settings']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=R_STATE
)
# NOTE(review): max_depth is negative here; presumably that means "no depth limit"
# in LightGBM - confirm against its parameter documentation.
model = lgb.LGBMClassifier(
    learning_rate=0.09,
    max_depth=-5,
    random_state=R_STATE,
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
target_names = ['low', 'med', 'high', 'max']
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)

In [None]:
# ROC curves for the LightGBM classifier bound to `model` in the previous cell (train split for fitting, test split for scoring).
plot_ROC_curve(model, X_train, y_train, X_test, y_test)

In [None]:
#25 components
# LightGBM on the first 25 principal components, evaluated on a 30% hold-out split.

lgbm_full = lgb.LGBMClassifier(learning_rate=0.09,max_depth=-5,random_state=R_STATE)
df1=df.drop("Game Settings",axis=1)
X = df1
y = df['Game Settings']
# PCA is seeded as well: its solver may be randomized, and every other stochastic step
# in this notebook is pinned to R_STATE.
steps = [('pca', PCA(n_components=25, random_state=R_STATE)), ('m', lgbm_full)]
model = Pipeline(steps=steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=R_STATE)
model.fit(X_train, y_train)

y_pred=model.predict(X_test)
# NOTE(review): classification_report pairs target_names with the sorted label values,
# so this order assumes 'Game Settings' sorts as low < med < high < max - confirm the encoding.
target_names = ['low', 'med', 'high','max']
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
# ROC curves for the pipeline bound to `model` in the previous cell (train split for fitting, test split for scoring).
plot_ROC_curve(model, X_train, y_train, X_test, y_test)

In [None]:
#30 components
# LightGBM on the first 30 principal components, evaluated on a 30% hold-out split.

lgbm_full = lgb.LGBMClassifier(learning_rate=0.09,max_depth=-5,random_state=R_STATE)
df1=df.drop("Game Settings",axis=1)
X = df1
y = df['Game Settings']
# PCA is seeded as well: its solver may be randomized, and every other stochastic step
# in this notebook is pinned to R_STATE.
steps = [('pca', PCA(n_components=30, random_state=R_STATE)), ('m', lgbm_full)]
model = Pipeline(steps=steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=R_STATE)
model.fit(X_train, y_train)

y_pred=model.predict(X_test)
# NOTE(review): classification_report pairs target_names with the sorted label values,
# so this order assumes 'Game Settings' sorts as low < med < high < max - confirm the encoding.
target_names = ['low', 'med', 'high','max']
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
# ROC curves for the pipeline bound to `model` in the previous cell (train split for fitting, test split for scoring).
plot_ROC_curve(model, X_train, y_train, X_test, y_test)

<a href="https://kritikseth.github.io/redirect" target="_parent"><img src="https://raw.githack.com/kritikseth/kritikseth/master/redirect.svg" alt="Kritik Seth"/></a>