In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from collections import Counter
import math
from statistics import *
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

### Principal Component Analysis (PCA)
We can determine the most important features of this dataset using PCA. PCA lets us trade a little accuracy for
simplicity: data with fewer dimensions is much easier and faster for machine learning algorithms
to analyze.

Let's run PCA on a merged dataset that combines title.ratings and title.basics, as it contains 5 **useful** features out of 11.

title.ratings.csv preview

In [None]:
# Ratings table; the file is tab-delimited even though it carries a .csv extension.
titleRatingsDf = pd.read_csv(
    'dataset/originalDataset/title.ratings.csv',
    delimiter='\t',
    low_memory=False,
)
# Preview the first five rows (head() defaults to n=5).
titleRatingsDf.head()

title.basics.csv preview

In [None]:
# Basics table; tab-delimited like the ratings file.
titleBasicsDf = pd.read_csv(
    'dataset/originalDataset/title.basics.csv',
    delimiter='\t',
    low_memory=False,
)
# Preview the first five rows (head() defaults to n=5).
titleBasicsDf.head()

Merge and Remove Null Entries

In [None]:
# Join ratings with basics on the shared "tconst" key (default inner join),
# then keep only the five candidate feature columns.
inner_merged = (
    titleRatingsDf
    .merge(titleBasicsDf, on="tconst")
    [['averageRating', 'numVotes', 'isAdult', 'startYear', 'runtimeMinutes']]
)

# Remove Null Entries
def removeNa(df):
    r"""Return a copy of ``df`` without rows that contain missing values.

    Empty strings, single-space strings and the literal ``\N`` marker are
    treated as missing, in addition to values that are already NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the rows in which no column is missing.
        Surviving rows keep their original index labels.
    """
    to_nan = {
        "": np.nan,
        " ": np.nan,
        '\\N': np.nan
    }
    # replace() without inplace=True returns a new frame, so the caller's
    # object (typically a column slice of a larger frame) is never mutated.
    return df.replace(to_nan).dropna()

# Drop incomplete rows, then keep the four non-binary columns
# (isAdult is excluded: binary not good for model).
main_df = removeNa(inner_merged)[
    ['averageRating', 'numVotes', 'startYear', 'runtimeMinutes']
]
# Optional export of the cleaned frame (disabled):
# main_df.to_csv('Results/ModelData.csv', index=False, header=['averageRating', 'numVotes', 'startYear', 'runtimeMinutes'])
main_df.head(n=10)

Preprocess, scale and standardize the data before doing PCA on it


In [None]:
# Standardize each column (preprocessing.scale centers to zero mean and scales
# to unit variance) so no feature dominates the components because of its units.
scaled_data = preprocessing.scale(main_df)
pca = PCA()
pca.fit(scaled_data)
pca_data = pca.transform(scaled_data)

# Percentage of total variance explained by each component, to one decimal place.
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
labels = [f'PC{i}' for i in range(1, len(per_var) + 1)]

# Visualize scree plot
plt.bar(x=range(1, len(per_var) + 1), height=per_var, tick_label=labels, edgecolor='black')
plt.xlabel("Principal Component")  # fixed spelling of the rendered axis label
plt.ylabel("Percentage Explained Variance")
plt.title("Scree Plot")
# plt.savefig('Results/Scree Plot.png', bbox_inches='tight')
plt.show()

Plot the PCA graph using the two most significant principal components

In [None]:
# NOTE(review): the frame is indexed by the PC1 scores (pca_data.T[0]); this is
# kept unchanged, but a default RangeIndex may be what was intended - confirm.
pca_df = pd.DataFrame(pca_data, index=pca_data.T[0], columns=labels)
plt.scatter(pca_df.PC1, pca_df.PC2)
plt.title('PCA Graph')
# Axis labels must be plain strings: a list is converted with str(), so the
# label would be drawn with its brackets and quotes, e.g. "['PC1 - 24.1%']".
plt.xlabel('PC1 - {0}%'.format(per_var[0]))
plt.ylabel('PC2 - {0}%'.format(per_var[1]))

# Two leading components only.
df1 = pca_df[['PC1', 'PC2']]
# plt.savefig('Results/pca_graph.png', bbox_inches='tight')
plt.show()

### Analysis
- From the scree plot and the PCA graph of the two most significant principal components, PC1 and PC2 explain 24.1% and 20.6% of the variance, respectively.
- Together they account for less than half of the total variance, so PC1 and PC2 alone are not sufficient to explain most of the variance in the data.
- This shows that dimensionality reduction on this dataset of 5 features is not viable, as no single principal component dominates.
- Consequently, the number of features cannot be reduced without losing a substantial share of the variance.

### Classification
(1) Decision Tree Classifier


In [None]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [None]:
# Expects Results/ModelData.csv to exist already; the to_csv export in the
# cleaning cell is commented out, so a fresh run will not create it.
df_filtered = pd.read_csv('Results/ModelData.csv', low_memory=False)
# Fixed seed: the same rows are drawn on every run, so the distances, class
# labels and reported accuracy are reproducible. The size is capped at the
# number of available rows, because sample() raises when asked for more rows
# than exist (sampling without replacement).
df_sample = df_filtered.sample(n=min(100000, len(df_filtered)), random_state=42)

Use a sample of 100,000 objects and calculate the Mahalanobis distance of each object from the sample mean.

In [None]:
# Mahalanobis Distance
from scipy.spatial.distance import cdist
import numpy as np

# Work on the sampled frame (alias only, no copy is made here).
df_mahalonobis = df_sample

# Float matrix of the sample: one row per object, one column per attribute.
data_mahala = df_mahalonobis.to_numpy().astype(np.float64)
# Column-wise mean, i.e. the mean of each attribute.
mean = data_mahala.mean(axis=0)

def calculateMahalanobis(x, mean, data, inv_cov=None):
    """Mahalanobis distance of observation ``x`` from ``mean``.

    Computes ``sqrt((x - mean) . S^-1 . (x - mean)^T)`` where ``S`` is the
    unbiased sample covariance of ``data``.

    Parameters
    ----------
    x : array-like
        A single observation (one value per attribute).
    mean : array-like
        Reference point, same length as ``x``.
    data : array-like, shape (n_samples, n_attributes)
        Observations used to estimate the covariance matrix. Ignored when
        ``inv_cov`` is supplied.
    inv_cov : array-like, shape (n_attributes, n_attributes), optional
        Pre-computed inverse covariance matrix. Pass it when scoring many
        observations against the same data so the covariance is not
        re-estimated and re-inverted on every call.

    Returns
    -------
    numpy.float64
        The distance (for 1-D ``x``).

    Raises
    ------
    numpy.linalg.LinAlgError
        If the covariance matrix is singular.
    """
    if inv_cov is None:
        # np.cov returns a 0-d array when there is a single attribute;
        # atleast_2d turns it into a 1x1 matrix so it can be inverted.
        covM = np.atleast_2d(np.cov(np.transpose(data), bias=False))
        inv_cov = np.linalg.inv(covM)

    diff_data_mean = x - mean
    projected = np.dot(diff_data_mean, inv_cov)
    squared_distance = np.dot(projected, np.transpose(diff_data_mean))

    return np.sqrt(squared_distance)

# The covariance matrix and its inverse depend only on the sample, not on the
# row being scored, so they are computed once here instead of once per row.
inv_cov = np.linalg.inv(np.atleast_2d(np.cov(np.transpose(data_mahala), bias=False)))
centered = data_mahala - mean

# Row-wise quadratic form (x - mean) . inv_cov . (x - mean) for every row at once.
squared_distances = np.einsum('ij,jk,ik->i', centered, inv_cov, centered)

# Rounded to 3 decimals as before; tolist() yields plain Python floats.
mahala_distances = np.round(np.sqrt(squared_distances), 3).tolist()

print(mahala_distances[:100])

Visualize the distances to see if there are any patterns, then try to split them into 3 distinct classes.

In [None]:
# One point per sampled row, in sample order; labelled so the figure can be
# read on its own.
plt.plot(mahala_distances)
plt.title('Mahalanobis Distance per Sampled Row')
plt.xlabel('Position in sample')
plt.ylabel('Mahalanobis distance')
plt.show()

In [None]:
# Bucket every Mahalanobis distance into one of three classes:
#   1 -> 0 <= d <= 1
#   2 -> 1 <  d <= 1.5
#   3 -> anything else
labels = [
    1 if 0 <= d <= 1 else (2 if 1 < d <= 1.5 else 3)
    for d in mahala_distances
]
# Largest distance in the sample.
m = max(mahala_distances)

print(len(labels))

In [None]:
# Mahalonobis
import pandas as pd
from matplotlib import pyplot as plt
import statistics

plt.style.use('fivethirtyeight')

# `labels` holds the class id assigned to each sampled row, so this is a
# histogram of class membership. log=True draws the counts on a log scale so
# that sparsely populated classes remain visible.
plt.hist(labels, edgecolor='black', log=True)

# NOTE: this is the arithmetic mean of the class ids, not a median; the
# variable name is kept so later cells that reference it keep working.
median_distance = statistics.mean(labels)
color = '#fc4f30'

# Population standard deviation of the class ids.
std = statistics.pstdev(labels)

# Reference lines at the mean and at 1, 2 and 3 standard deviations above it.
plt.axvline(median_distance, color=color, label=f'Mean Class {round(median_distance,2)}', linewidth=2)
plt.axvline(median_distance + std, color='blue', label=f'Mean + 1 Std {round(median_distance + std,2)}', linewidth=2)
plt.axvline(median_distance + 2*std, color='red', label=f'Mean + 2 Std {round(median_distance + 2*std,2)}', linewidth=2)
plt.axvline(median_distance + 3*std, color='green', label=f'Mean + 3 Std {round(median_distance + 3*std,2)}', linewidth=2)

# The legend is needed to tell the four reference lines apart.
plt.legend()
plt.title('Mahalanobis Distance Classes')
# The histogram puts the binned value (class id) on x and the counts on y.
plt.xlabel('Class')
plt.ylabel('Frequency')

plt.tight_layout()

plt.show()

Split the distances into 3 class intervals: 0–1 is class 1, 1–1.5 is class 2, and anything above 1.5 is class 3.

Using the Decision Tree Classifier, make a 70-30 split: 70% training and 30% test.
Of the splits tried, this one gave the best model performance, with an average
accuracy of 98%. A visualization of the Decision Tree is shown below.

In [None]:
x = df_sample # Features
y = pd.DataFrame(labels) # class labels
# NOTE(review): the labels are derived from the Mahalanobis distance of these
# same feature rows, so the tree is learning a deterministic function of x.

# Fixed seeds make the split, the fitted tree and the printed accuracy
# identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
) # 70% training and 30% test

# Create Decision Tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Train DTC
clf = clf.fit(x_train, y_train)

# predict the response for the train dataset
# y_pred = clf.predict(x_train)
# print(f"Train Accuracy: {metrics.accuracy_score(y_train, y_pred)}")

# predict the response for the test dataset
y_pred = clf.predict(x_test)
print(f"Test Accuracy: {metrics.accuracy_score(y_test, y_pred)}")

In [None]:
from six import StringIO
from sklearn import tree
from IPython.display import Image  
import pydotplus

feature_cols = ['averageRating', 'numVotes', 'startYear', 'runtimeMinutes']
# export_graphviz expects class names in ascending class order, which is how
# clf.classes_ is stored. Deriving the names from the fitted model keeps them
# aligned with the classes actually present in the training split.
class_names = [str(class_id) for class_id in clf.classes_]
data = tree.export_graphviz(clf, out_file=None,
                filled=True, rounded=True, feature_names=feature_cols, class_names=class_names)

graph = pydotplus.graph_from_dot_data(data)
# Render the PNG once and reuse the bytes for both the file on disk and the
# inline preview, instead of invoking Graphviz twice.
png_bytes = graph.create_png()
with open('DecisionTree.png', 'wb') as png_file:
    png_file.write(png_bytes)
Image(png_bytes)