In [324]:
#import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC

In [325]:
#read the modelling dataset from disk into df_main
df_main = pd.read_csv(
    '../data/model_data.csv',
    low_memory=False,
)

In [326]:
#establish model variables

X = df_main['combinedtext']
y = df_main['label']

cvec = CountVectorizer(max_features = 1000, stop_words = 'english')

#convert back to dataframe
X_cvec = pd.DataFrame(cvec.fit_transform(X).todense(),
                      columns = cvec.get_feature_names_out())

In [327]:
# Split the data into the training and testing sets. random state set to 88
X_train, X_test, y_train, y_test = train_test_split(X_cvec,
                                                    y,
                                                    test_size=0.33,
                                                    stratify=y,
                                                    random_state=88)

<b>Define Functions for Modeling</b>

In [328]:
#to obtain the mean count of a specific keyword for one brand
def cal_mean(source, brand, text):
    """Return the mean value of the keyword column ``text`` for one brand.

    Reads the module-level frames ``combined_train`` / ``combined_test``,
    which must exist before the function is called.

    Parameters
    ----------
    source : int
        1 reads from ``combined_train``; any other value reads from
        ``combined_test``.
    brand : int
        0 selects rows whose ``label`` is 0 (flstudio); any other value
        selects rows whose ``label`` is 1 (ableton).
    text : str
        Keyword, matched exactly against a column name.

    Returns
    -------
    float
        Mean of the keyword column over the selected rows. NaN when the
        frame has no column named ``text`` or no row carries the label.
    """
    frame = combined_train if source == 1 else combined_test
    wanted_label = 0 if brand == 0 else 1

    # Exact column lookup: a regex filter would also average every other
    # column whose name merely contains the keyword (e.g. 'studio' inside
    # 'flstudio').
    if text not in frame.columns:
        return float('nan')

    selected = frame.loc[frame['label'] == wanted_label, text]
    return float(selected.mean())

In [329]:
#feature frames with the class label attached as an extra 'label' column
#(assign returns a new frame and aligns the labels on the row index)
combined_train = X_train.assign(label=y_train)
combined_test = X_test.assign(label=y_test)

<b>Random Forest Classifier</b><br/><br/>
train cross validation score = 0.909<br/>
test cross validation score = 0.909<br/>
accuracy = 91%<br/>
type 1 error (FP) = 402/7982 = 5.0%<br/>
type 2 error (FN) = 289/7982 = 3.6%<br/><br/>
<b>The train and test scores show that the fit is good. The accuracy is fairly high and the Type 1 and Type 2 errors are reasonably low. There isn't strong pressure to further reduce the errors. This result is fairly similar to the K-Nearest Neighbour Classifier, but RFC is marginally better overall.</b>

In [None]:
#identify the best model using Grid Search
#random_state is fixed so the forests, and therefore the selected parameters,
#are the same on every run
rfc = RandomForestClassifier(random_state=88)

#3 forest sizes x 6 depth limits, each evaluated with 10 stratified folds
rfc_params = {'n_estimators': [100, 150, 200], 'max_depth': [None, 1, 2, 3, 4, 5]}
rfc_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=88)

rfc_gs = GridSearchCV(rfc, rfc_params, cv=rfc_cv)
rfc_gs.fit(X_train, y_train)

print(rfc_gs.best_score_)
print(rfc_gs.best_params_)

In [None]:
#score of the grid-searched model on the training set
rfc_gs.score(X_train, y_train)

In [None]:
#score of the grid-searched model on the held-out test set
rfc_gs.score(X_test, y_test)

In [None]:
#refit a standalone forest with the best grid-search parameters
#random_state is fixed so feature importances and predictions are reproducible
rfc = RandomForestClassifier(n_estimators = rfc_gs.best_params_['n_estimators'],
                             max_depth = rfc_gs.best_params_['max_depth'],
                             random_state = 88)
rfc.fit(X_train, y_train)

In [None]:
#mean 10-fold cross validation score on the training set
cross_val_score(rfc, X_train, y_train, cv=rfc_cv).mean()

In [None]:
#mean 10-fold cross validation score computed within the test set
#(cross_val_score fits a fresh clone of rfc on the test folds for each split)
cross_val_score(rfc, X_test, y_test, cv=rfc_cv).mean()

In [None]:
#identify keywords that are driving identification
#one row per keyword, most important first
df = (
    pd.DataFrame({'keyword': X_train.columns,
                  'importance': rfc.feature_importances_})
    .sort_values(by='importance', ascending=False)
    .reset_index(drop=True)
)

#brand 1 = ableton and 0 = flstudio 
#source 1 = train and 0 = test
#each new column holds cal_mean(source, brand, keyword) for every keyword
df = df.assign(**{
    name: [cal_mean(source, brand, word) for word in df["keyword"]]
    for name, (source, brand) in {'ableton_train': (1, 1),
                                  'flstudio_train': (1, 0),
                                  'ableton_test': (0, 1),
                                  'flstudio_test': (0, 0)}.items()
})

In [None]:
#top 10 influential words by importance
df.head(10)

In [None]:
#horizontal bar chart of the 40 highest-ranked keywords
#both lists are reversed so the top-ranked word is drawn at the top of the chart
y_axis = list(df.iloc[0:40,0])[::-1]   #column 0 = keyword
x_axis = list(df.iloc[0:40,1])[::-1]   #column 1 = importance

plt.barh(y_axis, x_axis)
plt.title('Words by Importance')
plt.ylabel('Words')
#the bar length is the importance score, not a word count
plt.xlabel('Importance')
plt.show()

In [None]:
#save the keyword table; the Findings section reloads this file
df.to_csv(r'../data/rf_df.csv', index=False)

In [None]:
#calculate pred test values: predicted class for every row of the test set
rfc_y_pred_test = rfc.predict(X_test)

In [None]:
#accuracy score: fraction of test rows whose prediction equals the true label
accuracy_score(y_test, rfc_y_pred_test)

In [None]:
#confusion matrix: rows are the true labels, columns are the predicted labels
confusion_matrix(y_test, rfc_y_pred_test)

In [None]:
#classification report: per-class precision, recall and f1-score
print(classification_report(y_test, rfc_y_pred_test))

<b>K-Nearest Neighbour Classifier</b><br/><br/>
train cross validation score = 0.904<br/>
test cross validation score = 0.898<br/>
accuracy = 91%<br/>
type 1 error (FP) = 370/7982 = 4.6%<br/>
type 2 error (FN) = 366/7982 = 4.6%<br/><br/>
<b>The train and test scores show that the fit is good. The accuracy is fairly high and the Type 1 and Type 2 errors are reasonably low. There isn't strong pressure to further reduce the errors. This result is fairly similar to the Random Forest Classifier, but RFC is marginally better overall.</b>

In [None]:
#identify the best model using Grid Search

knnc = KNeighborsClassifier()

#candidate neighbourhood sizes: every k from 1 to 30
knnc_params = {'n_neighbors': [k for k in range(1, 31)]}
knnc_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=88)

#fit() returns the fitted search object itself
knnc_gs = GridSearchCV(estimator=knnc, param_grid=knnc_params, cv=knnc_cv).fit(X_train, y_train)

print(knnc_gs.best_score_)
print(knnc_gs.best_params_)

In [None]:
#score of the grid-searched model on the training set
knnc_gs.score(X_train, y_train)

In [None]:
#score of the grid-searched model on the held-out test set
knnc_gs.score(X_test, y_test)

In [None]:
knnc = RandomForestClassifier(n_estimators = knnc_gs.best_params_['n_neighbors'])
knnc.fit(X_train, y_train)

In [None]:
cross_val_score(knnc, X_train, y_train, cv=10).mean()

In [None]:
cross_val_score(knnc, X_test, y_test, cv=10).mean()

In [None]:
#indentify keywords that are driving identification
df = pd.DataFrame(knnc.feature_importances_, index=X_train.columns).sort_values(by=0, ascending=False)

df.reset_index(inplace=True)
df = df.rename(columns = {'index':'keyword'})
df = df.rename(columns = {0:'importance'})

#brand 1 = ableton and 0 = flstudio 
#source 1 = train and 0 = test
df['ableton_train'] = [cal_mean(1, 1, x) for x in df["keyword"]]
df['flstudio_train'] = [cal_mean(1, 0, x) for x in df["keyword"]]
df['ableton_test'] = [cal_mean(0, 1, x) for x in df["keyword"]]
df['flstudio_test'] = [cal_mean(0, 0, x) for x in df["keyword"]]

In [None]:
#top 10 influential words
df.head(10)

In [None]:
#horizontal bar chart of the 40 highest-ranked keywords
#both lists are reversed so the top-ranked word is drawn at the top of the chart
y_axis = list(df.iloc[0:40,0])[::-1]   #column 0 = keyword
x_axis = list(df.iloc[0:40,1])[::-1]   #column 1 = importance

plt.barh(y_axis, x_axis)
plt.title('Words by Importance')
plt.ylabel('Words')
#the bar length is the importance score, not a word count
plt.xlabel('Importance')
plt.show()

In [None]:
#save this section's keyword table (row index is not written)
df.to_csv(r'../data/knnc_df.csv', index=False)

In [None]:
#calculate pred test values: predicted class for every row of the test set
knnc_y_pred_test = knnc.predict(X_test)

In [None]:
#accuracy score: fraction of test rows whose prediction equals the true label
accuracy_score(y_test, knnc_y_pred_test)

In [None]:
#confusion matrix: rows are the true labels, columns are the predicted labels
confusion_matrix(y_test, knnc_y_pred_test)

In [None]:
#classification report: per-class precision, recall and f1-score
print(classification_report(y_test, knnc_y_pred_test))

<b>Support Vector Machines</b><br/><br/>
train cross validation score = 0.894<br/>
test cross validation score = 0.882<br/>
accuracy = 90%<br/>
type 1 error (FP) = 405/7982 = 5.1%<br/>
type 2 error (FN) = 420/7982 = 5.3%<br/><br/>
<b>The train and test scores show that the fit is good. The accuracy is fairly high and the Type 1 and Type 2 errors are reasonably low. There isn't strong pressure to further reduce the errors. RFC is still marginally better overall.</b>

In [None]:
#identify the best model using Grid Search

svc = LinearSVC(max_iter=40000)

# C values to GridSearch over
svc_params = {"C": np.linspace(0.0001, 2, 10)}

svc_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=88)

#the search is fitted on the training set only; tuning C on the test set
#would leak test information into the model selection
svc_gs = GridSearchCV(svc, svc_params, cv=svc_cv)
svc_gs.fit(X_train, y_train)

print(svc_gs.best_score_)
print(svc_gs.best_params_)

In [None]:
#score of the grid-searched model on the training set
svc_gs.score(X_train, y_train)

In [None]:
#score of the grid-searched model on the test set
svc_gs.score(X_test, y_test)

In [None]:
#refit a standalone linear SVM with the best C from the grid search
#max_iter matches the 40000 used during the search so the refit gets the same
#iteration budget as the model that was selected
svc = LinearSVC(max_iter=40000, C = svc_gs.best_params_['C'])

svc.fit(X_train, y_train)

In [None]:
#mean 10-fold cross validation score on the training set
cross_val_score(svc, X_train, y_train, cv=svc_cv).mean()

In [None]:
#mean 10-fold cross validation score computed within the test set
cross_val_score(svc, X_test, y_test, cv=svc_cv).mean()

In [None]:
#calculate pred test values: predicted class for every row of the test set
svc_y_pred_test = svc.predict(X_test)

In [None]:
#accuracy score: fraction of test rows whose prediction equals the true label
accuracy_score(y_test, svc_y_pred_test)

In [None]:
#confusion matrix: rows are the true labels, columns are the predicted labels
confusion_matrix(y_test, svc_y_pred_test)

In [None]:
#classification report: per-class precision, recall and f1-score
print(classification_report(y_test, svc_y_pred_test))

In [None]:
#build the SVM keyword table before saving, so the file holds this model's
#keywords rather than whatever `df` was left over from an earlier section
#importance = absolute weight of the keyword in the fitted linear SVM
df = pd.DataFrame({'keyword': X_train.columns,
                   'importance': np.abs(svc.coef_).mean(axis=0)})
df = df.sort_values(by='importance', ascending=False).reset_index(drop=True)

#brand 1 = ableton and 0 = flstudio
#source 1 = train and 0 = test
df['ableton_train'] = [cal_mean(1, 1, x) for x in df["keyword"]]
df['flstudio_train'] = [cal_mean(1, 0, x) for x in df["keyword"]]
df['ableton_test'] = [cal_mean(0, 1, x) for x in df["keyword"]]
df['flstudio_test'] = [cal_mean(0, 0, x) for x in df["keyword"]]

df.to_csv(r'../data/svm_df.csv', index=False)

In [None]:
#one row per test post: original row index, true label and each model's prediction
y_test_check = y_test.reset_index().assign(
    rfc=rfc_y_pred_test,
    knnc=knnc_y_pred_test,
    svc=svc_y_pred_test,
)
#error flags are based on the Random Forest predictions only
#type1: rfc predicted a lower class value than the true label
#type2: rfc predicted a higher class value than the true label
y_test_check = y_test_check.assign(
    type1=y_test_check['rfc'].lt(y_test_check['label']),
    type2=y_test_check['rfc'].gt(y_test_check['label']),
)

<b>Error Examination: A sample of 10 wrongly classified</b><br><br>
<b>Type 1 Error (Classified as FL Studio but is Ableton)</b><br>
The posts do not contain the clear Ableton and FL Studio keywords; they seem to be genuine posts that could appear in either subreddit.<br><br>
<b>Type 2 Error (Classified as Ableton but is FL Studio)</b><br>
The posts do not contain the clear Ableton and FL Studio keywords; they seem to be genuine posts that could appear in either subreddit. A few keywords that are more typical of Ableton appear in these posts.

In [None]:
#Type 1 Error
#random index generated = 19902, 15213, 20472, 21939 and 19883
y_test_check[y_test_check.type1 == True].sample(n = 5)

In [None]:
#lable as 1 but is 0 (1 = Ableton, 0 = Fl Studio)
df_main.combinedtext[19902]

In [None]:
df_main.combinedtext[15213]

In [None]:
df_main.combinedtext[20472]

In [None]:
df_main.combinedtext[21939]

In [None]:
df_main.combinedtext[19883]

In [None]:
#Type 2 Error
#random index generated = 3619, 7862, 7809, 10261 and 10552
y_test_check[y_test_check.type2 == True].sample(n = 5)

In [None]:
#lable as 0 but is 1 (1 = Ableton, 0 = Fl Studio)
df_main.combinedtext[3619]

In [None]:
df_main.combinedtext[7862]

In [None]:
df_main.combinedtext[7809]

In [None]:
df_main.combinedtext[10261]

In [None]:
df_main.combinedtext[10552]

<b>Model Evaluation Summary</b>

|Model|Train Cross Validation|Test Cross Validation|Accuracy|Type 1 Error|Type 2 Error|
|:---|---:|---:|---:|---:|---:|
|Random Forest Classifier|0.909|0.909|91%|5.0%|3.6%|
|K-Nearest Neighbour Classifier|0.904|0.898|91%|4.6%|4.6%|
|Support Vector Machines|0.894|0.882|90%|5.1%|5.3%|

<b>All three models give good results, but the Random Forest Classifier has a slight edge over the others. Overall, it was a very close contest between the models.</b>

<b>Findings</b>

In [None]:
#reload result data
df_result = pd.read_csv(r'../data/rf_df.csv', low_memory=False)

In [None]:
#create comparison - which is more for Ableton of FL Studio
df_result['more_ableton'] = df_result.ableton_train > df_result.flstudio_train

In [None]:
    #show the top important words more closely related to Ableton
#keep the keywords flagged more_ableton and plot the first 20 rows
#both lists are reversed so the first row is drawn at the top of the chart
df_result_ableton = df_result[df_result.more_ableton == True]
y_axis = list(df_result_ableton.iloc[0:20,0])[::-1]   #column 0 = keyword
x_axis = list(df_result_ableton.iloc[0:20,1])[::-1]   #column 1 = importance

plt.barh(y_axis, x_axis)
plt.title('Words for Ableton')
plt.ylabel('Words')
#the bar length is the importance score, not a word count
plt.xlabel('Importance')
plt.show()

In [None]:
#show the top important words more closely related to Fl Studio
#the subset gets its own name so it does not overwrite the Ableton subset
#both lists are reversed so the first row is drawn at the top of the chart
df_result_flstudio = df_result[df_result.more_ableton == False]
y_axis = list(df_result_flstudio.iloc[0:20,0])[::-1]   #column 0 = keyword
x_axis = list(df_result_flstudio.iloc[0:20,1])[::-1]   #column 1 = importance

plt.barh(y_axis, x_axis)
plt.title('Words for FL Studio')
plt.ylabel('Words')
#the bar length is the importance score, not a word count
plt.xlabel('Importance')
plt.show()