In [3]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import os
from time import time


# Feature Extraction

In [2]:
folder_path = 'Desktop/data_output/'
all_files = os.listdir(folder_path)
thread_data_1 = pd.DataFrame()
for file_name in all_files:
    doc_df = pd.read_csv(folder_path+file_name, encoding=('iso-8859-1'))
    thread_data_1 = thread_data_1.append(doc_df, ignore_index= True)

In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

for i in range(len(thread_data_1)):
    # do counts
    Message_soup = BeautifulSoup(thread_data_1['Message HTML'][i], 'html.parser')
    thread_data_1.loc[i, 'img_count'] = len(Message_soup.find_all('img')) - len(Message_soup.find_all('img', class_='emoticon'))  # count image tags
    thread_data_1.loc[i, 'p_count'] = len(Message_soup.find_all('p'))    # count p tags
    thread_data_1.loc[i, 'word_count'] = len(Message_soup.get_text().split(' ')) # count words
    
    # get sentiment
    Message_bodies = thread_data_1['Message Bodies'][i].replace("\\xa0", "").replace("\\n", "")
    ss = sid.polarity_scores(Message_bodies)
    
    thread_data_1.loc[i, 'compound'] = ss['compound']
    thread_data_1.loc[i, 'neg'] = ss['neg']
    thread_data_1.loc[i, 'neu'] = ss['neu']
    thread_data_1.loc[i, 'pos'] = ss['pos']
    
    thread_data_1.loc[i, 'user_count'] = len(thread_data_1['User List'][i].split(','))



In [26]:
thread_data_1.to_csv('features.csv', encoding='utf-8')

# Model Training

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def train(Xdata, ydata):
    #Create 70-30 splits
    X_train, X_test, y_train, y_test = train_test_split(Xdata, 
                                                    ydata, 
                                                    random_state=42, 
                                                    train_size=.7, 
                                                    test_size=.3)
    
    # transform scale of data
    ss = StandardScaler()
    X_train_scaled = ss.fit_transform(X_train)
    X_test_scaled = ss.transform(X_test)
    
    # set up models
    rf = RandomForestClassifier(random_state = 42, class_weight="balanced")
    svm = SVC(random_state = 42, class_weight="balanced")
    mlp = MLPClassifier(random_state = 42)

    models = [('Random Forest', rf), 
          ('Support Vector Machine', svm),
          ('Neural Networks', mlp)]

    performance = {'Model': [], 'Accuracy': [], 'Precision': [], 'Recall': []}

    for model_name, cur_model in models:
        cur_model.fit(X_train_scaled, y_train)
        y_predicted = cur_model.predict(X_test_scaled)
        
        accuracy = accuracy_score(y_predicted, y_test)
        precision = precision_score(y_predicted, y_test)
        recall = recall_score(y_predicted, y_test)
        
        performance['Model'].append(model_name)
        performance['Accuracy'].append(accuracy)
        performance['Precision'].append(precision)
        performance['Recall'].append(recall)

    return (performance)

In [4]:
thread_df = pd.read_csv('features.csv', encoding='utf-8')

In [6]:
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import train_test_split

thread_X = thread_df.drop(columns=['Solution Count', 'Thread ID', 'Message List', 'User List', 'Message HTML', 'Post Times', 'Message Bodies'])
thread_y = [0 if x==0 else 1 for x in thread_df['Solution Count']]  

performance = train(thread_X, thread_y)
df = pd.DataFrame(performance, index=performance['Model'], columns = ['Accuracy', 'Precision', 'Recall'])

In [7]:
df

Unnamed: 0,Accuracy,Precision,Recall
Random Forest,0.706667,0.391094,0.616794
Support Vector Machine,0.7,0.584705,0.56186
Neural Networks,0.718667,0.515973,0.607754


# Word based features

In [14]:
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from collections import Counter
from nltk.tokenize.moses import MosesTokenizer
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

n_features = 2000
mosesTokenizer = MosesTokenizer()
stop_words = set(stopwords.words('english'))
more_stop_words = [',','&quot;','(',')','/','&apos;t','&apos;re','&apos;s','&apos;ve','&gt;','+','~','-','*','\\',':','--', '\'',
                   '#','$','%','&amp;','&apos;','&apos;d','&apos;ll','&apos;m','..','...','....','"']
punct_stop_words = ['?','.']
all_stop_words = stop_words.union(more_stop_words)
all_stop_words_punct = all_stop_words.union(punct_stop_words)


In [16]:
class LemmaTokenizer(object):
     def __init__(self):
         self.wnl = WordNetLemmatizer()
     def __call__(self, doc):
        #Part-Of-Speech Tagged and Word Tokenized 
        tagged = pos_tag((word_tokenize(doc)))
        lems = []

        #For each tagged word, lemmatize the nouns, verbs, and adjectives
        for w,t in tagged:

            ## { Part-of-speech constants
            ## ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
            ## }

            #temporay variable to potentially change the word
            l = w 
            #noun
            if(t[0] == 'N'):
                l = self.wnl.lemmatize(w, 'n')
            #verb
            elif(t[0] == 'V'):
                l = self.wnl.lemmatize(w, 'v')
            #adjective    
            elif(t[0] == 'J'):
                l = self.wnl.lemmatize(w, 'a')
    
            lems.append(l)    
            # if(l != w):
            #     print('{} {} {}'.format(w,t,l))

        #return list of lemmed words
        return lems

In [17]:
thread_X = thread_df.drop(columns=['Solution Count', 'Thread ID', 'Message List', 'User List', 'Message HTML', 'Post Times', 'Message Bodies'])
thread_y = [0 if x==0 else 1 for x in thread_df['Solution Count']]  

# Use tf-idf features
thread_X_csr = csr_matrix(thread_X.values.astype(int))
print("Extracting tf-idf features...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   strip_accents = 'unicode',
                                   tokenizer = LemmaTokenizer(),
                                   ngram_range = (1,3),
                                   stop_words=all_stop_words_punct)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(thread_df['Message Bodies'])
print("done in %0.3fs." % (time() - t0))
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print (tfidf_feature_names)
#print (type(tfidf))

tfidf_thread_X = hstack((thread_X_csr, tfidf))
#print (tfidf_thread_X)

#Use tf (raw term count)
print("Extracting tf features...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                strip_accents = 'unicode',
                                tokenizer = LemmaTokenizer(),
                                ngram_range = (1,3),
                                stop_words=all_stop_words_punct)
t0 = time()
tf = tf_vectorizer.fit_transform(thread_df['Message Bodies'])
print("done in %0.3fs." % (time() - t0))
tf_feature_names = tf_vectorizer.get_feature_names()
print (tf_feature_names)

tf_thread_X = hstack((thread_X_csr, tf))

Extracting tf-idf features...
done in 634.709s.
Extracting tf features...


done in 622.190s.


In [21]:
from imblearn.under_sampling import RandomUnderSampler 

rus = RandomUnderSampler(random_state=42)
tfidf_thread_X_res, tfidf_thread_y_res = rus.fit_sample(tfidf_thread_X, thread_y)
tf_thread_X_res, tf_thread_y_res = rus.fit_sample(tf_thread_X, thread_y)
thread_X_res, thread_y_res = rus.fit_sample(thread_X, thread_y)
print('Resampled dataset shape {}'.format(Counter(thread_y_res)))


Resampled dataset shape Counter({0: 3581, 1: 3581})


In [36]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def train_basic_rf(Xdata, ydata):
    #Create 70-30 splits
    Xtrain, Xtest, ytrain, y_test = train_test_split(Xdata, 
                                                    ydata, 
                                                    random_state=42, 
                                                    train_size=.7, 
                                                    test_size=.3)
    
    # print('Baseline')
    curModel = RandomForestClassifier().fit(Xtrain, ytrain)
    startTime = time()
    
    y_predicted = curModel.predict(Xtest)
        
    accuracy = accuracy_score(y_predicted, y_test)
    precision = precision_score(y_predicted, y_test)
    recall = recall_score(y_predicted, y_test)
        
    print("Train accuracy score: {:.8f}".format(accuracy))
    print("Train precision score: {:.8f}".format(precision))
    print("Train recall score: {:.8f}".format(recall))
    print("done in %0.3fs." % (time()-startTime))


In [37]:
#try without text data
print ('Creating model for no text training...')
train_basic_rf(thread_X_res, thread_y_res)

Creating model for no text training...
Train accuracy score: 0.64495114
Train precision score: 0.60151803
Train recall score: 0.64892528
done in 0.010s.


In [38]:
print ('Creating model for tfidf training...')
train_basic_rf(tfidf_thread_X_res, tfidf_thread_y_res)

Creating model for tfidf training...
Train accuracy score: 0.62866450
Train precision score: 0.56261860
Train recall score: 0.63763441
done in 0.020s.


In [39]:
print ('Creating model for tf training...')
train_basic_rf(tf_thread_X_res, tf_thread_y_res)

Creating model for tf training...
Train accuracy score: 0.60772452
Train precision score: 0.53984820
Train recall score: 0.61380798
done in 0.018s.
