In [1]:
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
import bay12_solution_eposts as solution

In [2]:
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
post, thread = solution.prepare.load_dfs('train')

post.head()

Unnamed: 0,thread_num,user,text,quotes
0,45016,Mephansteras,Basically this is where we talk about what ga...,
1,45016,dakarian,The currently running or about to run games i...,
2,45016,webadict,And mine s started I ll try to limit m...,
3,45016,ExKirby,Mine needs players not,
4,45016,RedWarrior0,Mine can wait a bit BYORPE is a problem as it...,


In [3]:
thread.head()

Unnamed: 0,thread_num,thread_name,thread_label,thread_replies,thread_label_id
0,45016,Games Threshold Discussion and List [Vote for ...,other,5703,8
1,88720,New Player's Guide to the Subforum - New to Ma...,other,961,8
2,39338,Mafia: A Basic Tutorial,other,79,8
3,34959,Paranormal Mafia Game - Rules Discussion,other,1719,8
4,64229,Notable Games Archive,other,307,8


In [4]:
thread = thread.set_index('thread_num')

In [5]:
label_map = solution.prepare.load_label_map()

# Features

In [6]:
def count_prob(x):    
    has_quotes = 0
    for i in x:
        if i:
            has_quotes += 1
    return has_quotes / x.size
    
def get_mean_len_text(post):
    
    mean_len_text = post.groupby('thread_num',sort=False)['text'].apply(lambda x: np.mean(x.str.len())).reset_index(name='mean_len_text')
    mean_len_text = mean_len_text.set_index('thread_num')
    return mean_len_text

def get_quotes_probability(post):
    quotes_probability = post.groupby('thread_num', sort=False)['quotes'].apply(count_prob).reset_index(name='quotes_probability')
    quotes_probability = quotes_probability.set_index('thread_num')
    return quotes_probability

mean_len_text = get_mean_len_text(post)
quotes_probability = get_quotes_probability(post)


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1, 1), min_df=10)

In [8]:
word_vectors_raw = cv.fit_transform(thread['thread_name'])

In [9]:
word_df = pd.DataFrame(word_vectors_raw.toarray(), columns=cv.get_feature_names(), index=thread.index)
word_df.head()

Unnamed: 0_level_0,all,and,beginner,byor,day,discussion,end,for,game,in,...,sign,signups,the,thread,to,town,up,vote,win,wins
thread_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45016,0,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
88720,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,2,0,0,0,0,0
39338,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34959,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
64229,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
X = pd.concat([
        mean_len_text['mean_len_text'],
        quotes_probability['quotes_probability'],
        (thread['thread_replies'] + 1).rename('posts'), 
        word_df,
    ], axis='columns')
X.head()

Unnamed: 0_level_0,mean_len_text,quotes_probability,posts,all,and,beginner,byor,day,discussion,end,...,sign,signups,the,thread,to,town,up,vote,win,wins
thread_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45016,176.410589,0.0,5704,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
88720,148.083247,0.0,962,0,0,0,0,0,0,0,...,0,0,1,0,2,0,0,0,0,0
39338,305.1,0.0,80,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34959,462.841279,0.0,1720,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
64229,189.493506,0.0,308,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
y = thread['thread_label_id']

# Split


In [12]:
from sklearn.model_selection import train_test_split

validation_pct = 0.25
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=validation_pct, random_state=99)

# Fit


In [13]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 120)
forest.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=120, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
y_train_pred = pd.Series(
    forest.predict(X_train), 
    index=X_train.index, 
)
y_val_pred = pd.Series(
    forest.predict(X_val), 
    index=X_val.index, 
)

In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [16]:
def confusion_df(y_actual, y_pred):
    res = pd.DataFrame(
        confusion_matrix(y_actual, y_pred, labels=label_map.values),
        index=label_map.index.rename('predicted'),
        columns=label_map.index.rename('actual'),
    )
    return res

In [17]:
confusion_df(y_val, y_val_pred).style.highlight_max()

actual,bastard,beginners-mafia,byor,classic,closed-setup,cybrid,kotm,non-mafia-game,other,paranormal,supernatural,vanilla,vengeful
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
bastard,0,0,0,0,2,0,0,0,3,0,0,0,0
beginners-mafia,0,1,0,0,0,0,0,0,2,0,1,0,0
byor,0,0,1,0,0,0,0,0,1,0,0,0,0
classic,0,0,0,1,5,0,0,0,2,0,0,0,0
closed-setup,0,0,0,1,5,0,0,0,1,0,0,0,0
cybrid,0,0,0,0,1,0,0,0,0,0,0,0,0
kotm,0,0,0,0,0,0,0,0,1,0,0,0,0
non-mafia-game,0,0,0,0,0,0,0,0,0,0,0,0,0
other,0,0,1,0,1,0,0,0,51,2,0,0,0
paranormal,0,0,0,0,0,0,0,0,0,3,0,0,0


In [18]:
print("Test accuracy:", accuracy_score(y_train, y_train_pred))
print("Validation accuracy:", accuracy_score(y_val, y_val_pred))

Test accuracy: 1.0
Validation accuracy: 0.7111111111111111


In [19]:
report = classification_report(y_val, y_val_pred, labels=label_map.values, target_names=label_map.index)
print(report)

                 precision    recall  f1-score   support

        bastard       0.00      0.00      0.00         5
beginners-mafia       0.50      0.25      0.33         4
           byor       0.50      0.50      0.50         2
        classic       0.50      0.12      0.20         8
   closed-setup       0.36      0.71      0.48         7
         cybrid       0.00      0.00      0.00         1
           kotm       0.00      0.00      0.00         1
 non-mafia-game       0.00      0.00      0.00         0
          other       0.84      0.93      0.88        55
     paranormal       0.60      1.00      0.75         3
   supernatural       0.00      0.00      0.00         0
        vanilla       0.00      0.00      0.00         1
       vengeful       1.00      0.67      0.80         3

      micro avg       0.71      0.71      0.71        90
      macro avg       0.33      0.32      0.30        90
   weighted avg       0.67      0.71      0.67        90



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


# Predict


In [20]:
post_test, thread_test = solution.prepare.load_dfs('test')
thread_test = thread_test.set_index('thread_num')

In [21]:
word_vectors_raw_test = cv.transform(thread_test['thread_name'])
word_df_test = pd.DataFrame(word_vectors_raw_test.toarray(), columns=cv.get_feature_names(), index=thread_test.index)


In [22]:
mean_len_text_test = get_mean_len_text(post_test)
quotes_probability_test = get_quotes_probability(post_test)

In [23]:
X_test = pd.concat([
        mean_len_text_test['mean_len_text'],
        quotes_probability_test['quotes_probability'],
        (thread_test['thread_replies'] + 1).rename('posts'), 
        word_df_test,
    ], axis='columns')
X_test.head()

Unnamed: 0_level_0,mean_len_text,quotes_probability,posts,all,and,beginner,byor,day,discussion,end,...,sign,signups,the,thread,to,town,up,vote,win,wins
thread_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
126856,196.948718,0.0,39,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132415,696.820755,0.0,212,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
134482,452.997895,0.0,475,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
133728,708.946809,0.0,564,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
134270,68.727273,0.0,11,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
y_test_pred = pd.Series(
    forest.predict(X_test), 
    index=X_test.index, 
)

In [25]:
result = thread_test.copy()
result['thread_label_id'] = y_test_pred
result = result.reset_index()[['thread_num', 'thread_label_id']]

# Export

In [26]:
out_dir = os.path.abspath('output')
os.makedirs(out_dir, exist_ok=True)
result.to_csv(
    os.path.join(out_dir, 'forest_predict.csv'),
    index=False, header=True, encoding='utf-8', 
)