In [1]:
import numpy as np
import pandas as pd

# Step 1. Feature Extraction

In [2]:
# get thread_data from csv files 
import os

folder_path = '/Users/lily/Desktop/data_output/'
all_files = os.listdir(folder_path)
thread_data = pd.DataFrame()
for file_name in all_files:
    doc_df = pd.read_csv(folder_path+file_name, encoding=('iso-8859-1'))
    thread_data = thread_data.append(doc_df, ignore_index= True)
    
thread_data['Message Bodies'] = [w.replace('\\xa0', ' ').replace('\\n', ' ') for w in thread_data['Message Bodies']]

# export data to csv file
thread_data.to_csv('data_original.csv', encoding='utf-8', index=False)

## Extract thread based features

In [3]:
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()


for i in range(len(thread_data)):
    # do counts
    Message_soup = BeautifulSoup(thread_data['Message HTML'][i], 'html.parser')
    thread_data.loc[i, 'img_count'] = len(Message_soup.find_all('img')) - len(Message_soup.find_all('img', class_='emoticon'))  # count image tags
    thread_data.loc[i, 'p_count'] = len(Message_soup.find_all('p'))    # count p tags
    thread_data.loc[i, 'word_count'] = len(Message_soup.get_text().split(' ')) # count words
    
    # get sentiment
    ss = sid.polarity_scores(thread_data['Message Bodies'][i])
    
    thread_data.loc[i, 'compound'] = ss['compound']
    thread_data.loc[i, 'neg'] = ss['neg']
    thread_data.loc[i, 'neu'] = ss['neu']
    thread_data.loc[i, 'pos'] = ss['pos']
    
    thread_data.loc[i, 'user_count'] = len(thread_data['User List'][i].split(','))


In [4]:
# export data to csv file
thread_data = thread_data.drop(columns=['Thread ID', 'Message List', 'User List', 'Message HTML', 'Post Times', 'Message Bodies'])
thread_data['Solution Count'] = [0 if x==0 else 1 for x in thread_data['Solution Count']]  
thread_data.to_csv('data.csv', encoding='utf-8', index=False)

## Extract content based features

In [5]:
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from collections import Counter
from nltk.tokenize.moses import MosesTokenizer
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

n_features = 2000
mosesTokenizer = MosesTokenizer()
stop_words = set(stopwords.words('english'))
more_stop_words = [',','&quot;','(',')','/','&apos;t','&apos;re','&apos;s','&apos;ve','&gt;','+','~','-','*','\\',':','--', '\'',
                   '#','$','%','&amp;','&apos;','&apos;d','&apos;ll','&apos;m','..','...','....','"']
punct_stop_words = ['?','.']
all_stop_words = stop_words.union(more_stop_words)
all_stop_words_punct = all_stop_words.union(punct_stop_words)


In [6]:
class LemmaTokenizer(object):
     def __init__(self):
         self.wnl = WordNetLemmatizer()
     def __call__(self, doc):
        #Part-Of-Speech Tagged and Word Tokenized 
        tagged = pos_tag((word_tokenize(doc)))
        lems = []

        #For each tagged word, lemmatize the nouns, verbs, and adjectives
        for w,t in tagged:

            #temporay variable to potentially change the word
            l = w 
            #noun
            if(t[0] == 'N'):
                l = self.wnl.lemmatize(w, 'n')
            #verb
            elif(t[0] == 'V'):
                l = self.wnl.lemmatize(w, 'v')
            #adjective    
            elif(t[0] == 'J'):
                l = self.wnl.lemmatize(w, 'a')
    
            lems.append(l)    

        #return list of lemmed words
        return lems

In [7]:
# prepare data
from sklearn.model_selection import train_test_split

thread_df = pd.read_csv('data_original.csv', encoding='utf-8')

# separate features and targets
thread_X = thread_df.drop(columns=['Solution Count', 'Thread ID', 'Message List', 'User List', 'Message HTML', 'Post Times'])
thread_y = [0 if x==0 else 1 for x in thread_df['Solution Count']]  


X_train, X_test, y_train, y_test = train_test_split(thread_X, 
                                                    thread_y, 
                                                    random_state=42, 
                                                    train_size=.7, 
                                                    test_size=.3)

X_train_features = X_train.drop(columns=['Message Bodies'])
X_test_features = X_test.drop(columns=['Message Bodies'])

X_train_csr = csr_matrix(X_train_features.values.astype(int))
X_test_csr = csr_matrix(X_test_features.values.astype(int))


### Extract tf-idf features

In [8]:
print("Extracting tf-idf features...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   strip_accents = 'unicode',
                                   tokenizer = LemmaTokenizer(),
                                   ngram_range = (1,3),
                                   stop_words=all_stop_words_punct)

tfidf_train = tfidf_vectorizer.fit_transform(X_train['Message Bodies'])
tfidf_test = tfidf_vectorizer.transform(X_test['Message Bodies'])

X_train_tfidf = hstack((X_train_csr, tfidf_train))
X_test_tfidf = hstack((X_test_csr, tfidf_test))

# print tfidf features
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print (tfidf_feature_names)

Extracting tf-idf features...


### Extract tf features

In [9]:
print("Extracting tf features...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                strip_accents = 'unicode',
                                tokenizer = LemmaTokenizer(),
                                ngram_range = (1,3),
                                stop_words=all_stop_words_punct)

tf_train = tf_vectorizer.fit_transform(X_train['Message Bodies'])
tf_test = tf_vectorizer.transform(X_test['Message Bodies'])

X_train_tf = hstack((X_train_csr, tf_train))
X_test_tf = hstack((X_test_csr, tf_test))

# print tfidf features
tf_feature_names = tf_vectorizer.get_feature_names()
print (tf_feature_names)

Extracting tf features...


# Step 2. Data Pre-Processing

In [10]:
# undersample data
from imblearn.under_sampling import RandomUnderSampler 

def undersample(X_train, y_train):
    rus = RandomUnderSampler(random_state=42)
    X_train_res, y_train_res = rus.fit_sample(X_train, y_train)
        
    return X_train_res, y_train_res

In [11]:
# standard scale data
from sklearn.preprocessing import StandardScaler

def standardscale(X_train, X_test):
    # transform scale of data
    ss = StandardScaler()
    X_train_scaled = ss.fit_transform(X_train)
    X_test_scaled = ss.transform(X_test)
    return X_train_scaled, X_test_scaled

# Step 3. Model Training

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def train(X_train, X_test, y_train, y_test):
    
    # set up models
    rf = RandomForestClassifier(random_state = 42, class_weight='balanced')
    svm = SVC(random_state = 42, class_weight='balanced')
    mlp = MLPClassifier(random_state = 42)

    models = [('Random Forest', rf), 
          ('Support Vector Machine', svm),
          ('Neural Networks', mlp)]

    performance = {'Model': [], 'Accuracy': [], 'Precision': [], 'Recall': []}

    for model_name, cur_model in models:
        cur_model.fit(X_train, y_train)
        y_predicted = cur_model.predict(X_test)
        
        accuracy = accuracy_score(y_predicted, y_test)
        precision = precision_score(y_predicted, y_test)
        recall = recall_score(y_predicted, y_test)
        
        performance['Model'].append(model_name)
        performance['Accuracy'].append(accuracy)
        performance['Precision'].append(precision)
        performance['Recall'].append(recall)

    return (performance)

## Baseline

In [13]:
# baseline
performance = train(X_train_features, X_test_features, y_train, y_test)
df = pd.DataFrame(performance, index=performance['Model'], columns = ['Accuracy', 'Precision', 'Recall'])
df

Unnamed: 0,Accuracy,Precision,Recall
Random Forest,0.700333,0.422072,0.590786
Support Vector Machine,0.655667,0.007744,0.5
Neural Networks,0.559667,0.662149,0.413043


## Experiment One

In [14]:
# get data
from sklearn.model_selection import train_test_split

thread_df = pd.read_csv('data.csv', encoding='utf-8')

X_data = thread_df.drop(columns=['Solution Count'])
y_data = thread_df['Solution Count']

#Create 70-30 splits
X_train, X_test, y_train, y_test = train_test_split(X_data, 
                                                    y_data, 
                                                    random_state=42, 
                                                    train_size=.7, 
                                                    test_size=.3)

In [15]:
# transform data
X_train_scaled, X_test_scaled = standardscale(X_train, X_test)

#     # undersample data
#     X_train_res, y_train_res = standardscale(X_train_scaled, y_train)

In [16]:
# baseline + thread based features
from sklearn.metrics import accuracy_score
from sklearn import metrics

performance = train(X_train_scaled, X_test_scaled, y_train, y_test)
df = pd.DataFrame(performance, index=performance['Model'], columns = ['Accuracy', 'Precision', 'Recall'])
df

Unnamed: 0,Accuracy,Precision,Recall
Random Forest,0.709667,0.395934,0.623476
Support Vector Machine,0.695,0.601162,0.552491
Neural Networks,0.721,0.454017,0.632075


## Experiment Two

In [17]:
# baseline + tf
performance = train(X_train_tf, X_test_tf, y_train, y_test)
df = pd.DataFrame(performance, index=performance['Model'], columns = ['Accuracy', 'Precision', 'Recall'])
df

Unnamed: 0,Accuracy,Precision,Recall
Random Forest,0.665667,0.270087,0.528409
Support Vector Machine,0.634667,0.105518,0.3879
Neural Networks,0.637,0.208132,0.442387


In [18]:
# baseline + tf-idf
performance = train(X_train_tfidf, X_test_tfidf, y_train, y_test)
df = pd.DataFrame(performance, index=performance['Model'], columns = ['Accuracy', 'Precision', 'Recall'])
df

Unnamed: 0,Accuracy,Precision,Recall
Random Forest,0.664667,0.235237,0.529412
Support Vector Machine,0.625,0.133591,0.375
Neural Networks,0.637667,0.066796,0.359375
