In [295]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.utils import to_categorical
from keras.layers import Dense, Dropout, Activation
from keras.models import Sequential

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.decomposition import PCA

# Functions for Data: Tweets -> Int Representation

In [240]:
def strings_to_ints(data):
    """Encode each value in `data` as an integer id.

    Ids are assigned in order of first appearance: the first distinct value
    becomes 0, the next new value becomes 1, and so on.

    Parameters
    ----------
    data : iterable of hashable values (e.g. an array of label strings)

    Returns
    -------
    numpy.ndarray
        Integer array with exactly one id per input element, so it always
        stays aligned with `data` (empty input gives an empty int array).
    """
    codes = {}
    # setdefault returns the existing id for a known value, otherwise it stores
    # and returns the next free id. Every element therefore yields exactly one
    # id, including values that do not compare equal to themselves (NaN).
    new_data = [codes.setdefault(value, len(codes)) for value in data]
    return np.array(new_data, dtype=int)

In [241]:
def preprocess_word(word):
    """Return `word` lower-cased with every non-alphanumeric character removed."""
    kept_chars = (char for char in word if char.isalnum())
    return "".join(kept_chars).lower()

In [242]:
def get_clean_data(raw_data):
    """Tokenize and normalize every text in `raw_data`.

    Each text is split on whitespace, every token is passed through
    preprocess_word, and tokens that come back empty are discarded.

    Returns a list with one list of cleaned tokens per input text.
    """
    clean_data = []
    for text in raw_data:
        normalized = (preprocess_word(token) for token in text.split())
        clean_data.append([token for token in normalized if token != ""])
    return clean_data

In [243]:
def get_word_frequency(text_data):
    """Count how often each word occurs across all token lists in `text_data`.

    Returns a dict mapping word -> occurrence count; keys appear in the order
    in which the words were first seen.
    """
    frequency_dict = {}
    for tokens in text_data:
        for token in tokens:
            frequency_dict[token] = frequency_dict.get(token, 0) + 1
    return frequency_dict

In [244]:
def get_vocab(vocab_size, clean_data):
    """Return the `vocab_size` most frequent words in `clean_data`.

    Parameters
    ----------
    vocab_size : int
        Maximum number of words to return; values <= 0 give an empty list.
    clean_data : iterable of token lists, as produced by get_clean_data.

    Returns
    -------
    list of str
        Words ordered from most to least frequent. Words with the same count
        keep the order in which they first appear in `clean_data`. If fewer
        than `vocab_size` distinct words exist, all of them are returned
        instead of raising.
    """
    word_frequency_dict = get_word_frequency(clean_data)
    # sorted() is stable even with reverse=True, so ties stay in the dict's
    # insertion (first-seen) order.
    ranked_words = sorted(word_frequency_dict, key=word_frequency_dict.get, reverse=True)
    # Clamp so a negative size cannot turn into a "drop from the end" slice.
    return ranked_words[:max(vocab_size, 0)]

In [245]:
def fit_to_vocab(words, clean_data):
    """Build a bag-of-words count matrix.

    Parameters
    ----------
    words : sequence of vocabulary words; column j corresponds to words[j].
    clean_data : sequence of token lists; row i corresponds to clean_data[i].

    Returns
    -------
    numpy.ndarray of shape (len(clean_data), len(words)), float
        Entry [i][j] is the number of times words[j] occurs in clean_data[i].
    """
    input_data = np.zeros((len(clean_data), len(words)))
    for row, tokens in enumerate(clean_data):
        # Count each token once per text instead of rescanning the text for
        # every vocabulary word.
        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
        input_data[row] = [counts.get(word, 0) for word in words]
    return input_data

# Main: Data Preparation

In [246]:
# Read the dataset from the working directory.
# NOTE(review): 'Tweet' is presumably the raw tweet text and 'Party' the class label — confirm against the CSV.
df = pd.read_csv('DemRepTweets.csv')
data = df['Tweet'].values
# Turn the 'Party' values into integer class ids (one id per distinct value).
y = strings_to_ints(df['Party'].values)

In [247]:
# Number of most-frequent words used as feature columns.
vocab_size = 500

# Pipeline: tokenize/normalize the texts -> select the vocabulary ->
# count vocabulary words per text (x has one row per text, one column per word).
clean_data = get_clean_data(data)
words = get_vocab(vocab_size, clean_data)
x = fit_to_vocab(words, clean_data)

In [248]:
# Hold out 10% of the samples for testing. random_state pins the shuffle so the
# split, and therefore every score computed from it, is the same on each run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=1)

# Main: Models and Ensemble Methods

In [251]:
# Random forest 1: gini criterion, 25 trees, max_features=8.
# The printed time covers both fitting and scoring on the test split.
start = time.time()
RF_1 = RandomForestClassifier(
    n_estimators=25,
    criterion='gini',
    max_features=8,
    random_state=1,
    n_jobs=-1,
)
RF_1.fit(x_train, y_train)
rf_1_accuracy = RF_1.score(x_test, y_test)
rf_1_seconds = time.time() - start
print(format(rf_1_accuracy, '.5f'), "in", format(rf_1_seconds, '.2f'), "s")

0.67719 in 10.25 s


In [281]:
# Random forest 2: gini criterion, 40 trees, max_features=6, depth capped at 80.
# The printed time covers both fitting and scoring on the test split.
start = time.time()
RF_2 = RandomForestClassifier(
    n_estimators=40,
    criterion='gini',
    max_depth=80,
    max_features=6,
    random_state=1,
    n_jobs=-1,
)
RF_2.fit(x_train, y_train)
rf_2_accuracy = RF_2.score(x_test, y_test)
rf_2_seconds = time.time() - start
print(format(rf_2_accuracy, '.5f'), "in", format(rf_2_seconds, '.2f'), "s")

0.67476 in 9.55 s


In [263]:
# Random forest 3: entropy criterion, 25 trees, max_features=8.
# The printed time covers both fitting and scoring on the test split.
start = time.time()
RF_3 = RandomForestClassifier(
    n_estimators=25,
    criterion='entropy',
    max_features=8,
    random_state=1,
    n_jobs=-1,
)
RF_3.fit(x_train, y_train)
rf_3_accuracy = RF_3.score(x_test, y_test)
rf_3_seconds = time.time() - start
print(format(rf_3_accuracy, '.5f'), "in", format(rf_3_seconds, '.2f'), "s")

0.67222 in 11.36 s


In [282]:
# Random forest 4: entropy criterion, 40 trees, max_features=6, depth capped at 80.
# The printed time covers both fitting and scoring on the test split.
start = time.time()
RF_4 = RandomForestClassifier(
    n_estimators=40,
    criterion='entropy',
    max_depth=80,
    max_features=6,
    random_state=1,
    n_jobs=-1,
)
RF_4.fit(x_train, y_train)
rf_4_accuracy = RF_4.score(x_test, y_test)
rf_4_seconds = time.time() - start
print(format(rf_4_accuracy, '.5f'), "in", format(rf_4_seconds, '.2f'), "s")

0.67407 in 12.80 s


In [325]:
def get_models():
    """Return the four random forests as a list of (name, estimator) tuples."""
    named_forests = {
        'rf_1': RF_1,
        'rf_2': RF_2,
        'rf_3': RF_3,
        'rf_4': RF_4,
    }
    return list(named_forests.items())

In [326]:
# default voting classifier from sklearn
# Combines the four random forests from get_models() with voting='soft'.
# NOTE(review): VotingClassifier.fit is expected to fit fresh clones of the
# estimators, so the fits from the earlier cells are not reused — confirm
# against the sklearn documentation.

ensemble_model = VotingClassifier(get_models(), voting='soft')
ensemble_model.fit(x_train, y_train)
# Bare last expression: the notebook displays the test-split score as the cell output.
ensemble_model.score(x_test, y_test)

0.6924589405505436