In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import sys
import pandas

In [None]:
# Read the board-game data set from the local CSV file into a DataFrame.
games = pandas.read_csv("games.csv")


In [None]:
# Show the column labels first, then the (rows, columns) dimensions.
# Per the original author's note, average_weight indicates game complexity.
for overview in (games.columns, games.shape):
    print(overview)

In [None]:
# Plot how the average ratings are distributed across the full data set.
all_ratings = games["average_rating"]
plt.hist(all_ratings)
plt.show()

In [None]:
# Compare one game that has a zero average rating with one that has a
# positive average rating.
# .iloc selects by integer position, so .iloc[0] is the first row of each
# filtered frame regardless of what its index label is.
rating = games['average_rating']

# First row among the games rated exactly 0.
unrated_games = games[rating == 0]
print(unrated_games.iloc[0])

# First row among the games rated above 0.
rated_games = games[rating > 0]
print(rated_games.iloc[0])


In [None]:
# Keep only games that at least one user has rated, then discard every row
# that still contains a missing (NaN) value.
# dropna(axis=0) drops rows; axis=1 would drop columns instead.
has_reviews = games['users_rated'] > 0
games = games[has_reviews].dropna(axis=0)

# Re-plot the rating distribution on the cleaned data.
plt.hist(games['average_rating'])
plt.show()

In [None]:
# Correlation matrix between the attributes, drawn as a heatmap.
# Only numeric/boolean columns are passed to corr(): since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# if any text column is still present in the frame.
numeric_games = games.select_dtypes(include=["number", "bool"])
corrmat = numeric_games.corr()

fig = plt.figure(figsize=(12, 9))

# vmax caps the colour scale at 0.8; square=True draws square cells.
sns.heatmap(corrmat, vmax=0.8, square=True)
plt.show()

In [None]:
# The value the models will learn to predict.
target = "average_rating"

# Columns left out of the feature list:
#   average_rating       - the prediction target itself
#   bayes_average_rating - derived from the average rating
#   type                 - "boardgame" for every game (per the original note)
#   name, id             - do not contribute useful information
excluded = ("bayes_average_rating", "average_rating", "type", "name", "id")

# Every remaining column of the DataFrame is used as a model feature.
columns = [label for label in games.columns.tolist() if label not in excluded]


In [None]:
# Build the training and test sets with an 80/20 split.
# random_state fixes the sample so the split is reproducible.
train = games.sample(frac=0.8, random_state=1)

# Every row whose index label was not sampled into train goes to test.
in_train = games.index.isin(train.index)
test = games.loc[~in_train]

for subset in (train, test):
    print(subset.shape)

# Using A Linear Training Model: Linear Regression

In [None]:
# Linear regression model plus the error metric used to evaluate it.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Create the estimator with its default settings.
LR = LinearRegression()

# Fit on the training features and the training target; the fitted
# estimator is the cell's last expression, so the notebook displays it.
train_features = train[columns]
train_target = train[target]
LR.fit(train_features, train_target)



In [None]:
# Predict ratings for the held-out test set.
predictions = LR.predict(test[columns])

# Mean squared error between the actual and the predicted ratings.
# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric,
# so the value is unchanged, but the documented order avoids wrong results
# if this call is ever swapped for an asymmetric metric.
mean_squared_error(test[target], predictions)

# Using a Non-Linear Training Model: Random Forest Regressor Model

In [None]:
# Random forest regressor: the non-linear model compared against the
# linear regression.
from sklearn.ensemble import RandomForestRegressor

# 100 trees, at least 10 samples per leaf, fixed seed for reproducibility.
RFR = RandomForestRegressor(n_estimators=100, min_samples_leaf=10, random_state=1)

# Fit on the training features and the training target.
RFR.fit(train[columns], train[target])

# Predict ratings for the held-out test set.
predictions = RFR.predict(test[columns])

# Mean squared error, with arguments in scikit-learn's documented
# (y_true, y_pred) order. MSE is symmetric, so the value is unchanged.
mean_squared_error(test[target], predictions)

In [None]:
# Predict the rating of a single game (the first row of the test set) with
# both models and compare against its actual rating.
print(test[columns].iloc[0])

# .iloc[[0]] (list indexer) yields a one-row DataFrame, i.e. the 2-D input
# predict() requires. Keeping the column labels also avoids scikit-learn's
# "X does not have valid feature names" warning, which is raised when a model
# fitted on a DataFrame is given a bare NumPy array.
first_game = test[columns].iloc[[0]]
rating_LR = LR.predict(first_game)
rating_RFR = RFR.predict(first_game)

print(rating_LR)
print(rating_RFR)
# Actual rating. It must be printed explicitly: a bare expression that is
# not the last statement of a cell is never displayed.
print(test[target].iloc[0])

# Shape demonstration: .iloc[0].values is a 1-D array of shape (n_features,);
# reshape(1, -1) turns it into a 2-D array with a single row, shape
# (1, n_features). It does not produce a column vector.
print(test[columns].iloc[0].values.shape)

print(test[columns].iloc[0].values.reshape(1,-1).shape)

In [None]:
# Predict the rating of another single game from the test set with both
# models. The row is chosen by integer position and named once so that the
# features, the predictions and the actual rating all refer to the same game.
game_position = 494
print(test[columns].iloc[game_position])

# A one-row DataFrame (list indexer) is already 2-D and keeps the column
# labels, so scikit-learn does not warn about missing feature names.
game_features = test[columns].iloc[[game_position]]
rating_LR = LR.predict(game_features)
rating_RFR = RFR.predict(game_features)

print(rating_LR)
print(rating_RFR)
# Actual rating; as the cell's last expression it is displayed as the output.
test[target].iloc[game_position]