In [3]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import os
from time import time


# Feature Extraction

In [2]:
folder_path = 'Desktop/data_output/'
all_files = os.listdir(folder_path)
thread_data_1 = pd.DataFrame()
for file_name in all_files:
    doc_df = pd.read_csv(folder_path+file_name, encoding=('iso-8859-1'))
    thread_data_1 = thread_data_1.append(doc_df, ignore_index= True)

In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

for i in range(len(thread_data_1)):
    # do counts
    Message_soup = BeautifulSoup(thread_data_1['Message HTML'][i], 'html.parser')
    thread_data_1.loc[i, 'img_count'] = len(Message_soup.find_all('img')) - len(Message_soup.find_all('img', class_='emoticon'))  # count image tags
    thread_data_1.loc[i, 'p_count'] = len(Message_soup.find_all('p'))    # count p tags
    thread_data_1.loc[i, 'word_count'] = len(Message_soup.get_text().split(' ')) # count words
    
    # get sentiment
    Message_bodies = thread_data_1['Message Bodies'][i].replace("\\xa0", "").replace("\\n", "")
    ss = sid.polarity_scores(Message_bodies)
    
    thread_data_1.loc[i, 'compound'] = ss['compound']
    thread_data_1.loc[i, 'neg'] = ss['neg']
    thread_data_1.loc[i, 'neu'] = ss['neu']
    thread_data_1.loc[i, 'pos'] = ss['pos']



In [24]:
for i in range(len(thread_data_1)):
    thread_data_1.loc[i, 'user_count'] = len(thread_data_1['User List'][i].split(','))

In [26]:
thread_data_1.to_csv('features.csv', encoding='utf-8')

# Model Training

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def train(Xdata, ydata):
    #Create 70-30 splits
    X_train, X_test, y_train, y_test = train_test_split(Xdata, 
                                                    ydata, 
                                                    random_state=42, 
                                                    train_size=.7, 
                                                    test_size=.3)
    
    # transform scale of data
    ss = StandardScaler()
    X_train_scaled = ss.fit_transform(X_train)
    X_test_scaled = ss.transform(X_test)
    
    # set up models
    rf = RandomForestClassifier(random_state = 42, class_weight="balanced")
    svm = SVC(random_state = 42, class_weight="balanced")
    mlp = MLPClassifier(random_state = 42)

    models = [('Random Forest', rf), 
          ('Support Vector Machine', svm),
          ('Neural Networks', mlp)]

    performance = {'Model': [], 'Accuracy': [], 'Precision': [], 'Recall': []}

    for model_name, cur_model in models:
        cur_model.fit(X_train_scaled, y_train)
        y_predicted = cur_model.predict(X_test_scaled)
        
        accuracy = accuracy_score(y_predicted, y_test)
        precision = precision_score(y_predicted, y_test)
        recall = recall_score(y_predicted, y_test)
        
        performance['Model'].append(model_name)
        performance['Accuracy'].append(accuracy)
        performance['Precision'].append(precision)
        performance['Recall'].append(recall)

    return (performance)

In [5]:
thread_df = pd.read_csv('features.csv', encoding='utf-8')

In [6]:
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import train_test_split

thread_X = thread_df.drop(columns=['Solution Count', 'Thread ID', 'Message List', 'User List', 'Message HTML', 'Post Times', 'Message Bodies'])
thread_y = [0 if x==0 else 1 for x in thread_df['Solution Count']]  

performance = train(thread_X, thread_y)
df = pd.DataFrame(performance, index=performance['Model'], columns = ['Accuracy', 'Precision', 'Recall'])

In [7]:
df

Unnamed: 0,Accuracy,Precision,Recall
Random Forest,0.706667,0.391094,0.616794
Support Vector Machine,0.7,0.584705,0.56186
Neural Networks,0.718667,0.515973,0.607754
