In [17]:
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pickle
import graphviz

le = preprocessing.LabelEncoder()

# Get data, separate into features and labels
with open('../data/feature_vectors.obj', 'rb') as testing_file:
    data = pickle.load(testing_file)


data_df = pd.DataFrame.from_records(data, columns=["Party", "Tweet", "sentiment", "language", "dot_product", "vector", "is_retweet", "length_tweet", "num_hashtags", "num_mentions"])
data_df

Unnamed: 0,Party,Tweet,sentiment,language,dot_product,vector,is_retweet,length_tweet,num_hashtags,num_mentions
0,Democrat,"Today, Senate Dems vote to #SaveTheInternet. P...",1.660000,en,27.835464,"[-0.03105460652295733, 0.3615999109315453, 0.3...",False,139,2,0
1,Democrat,RT @WinterHavenSun: Winter Haven resident / Al...,1.420000,en,-6.018233,"[-0.05827691550621239, 0.2105472266111974, -0....",True,140,0,2
2,Democrat,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,1.240000,en,11.033403,"[0.2234409597585909, 0.1639013930107467, 0.352...",True,140,0,2
3,Democrat,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,1.720000,en,-0.447955,"[-0.24786693104397273, 0.4860032578581013, -0....",True,140,1,3
4,Democrat,RT @EmgageActionFL: Thank you to all who came ...,1.733333,en,26.004461,"[0.20108230298446017, 0.02912711470038687, -0....",True,139,0,1
...,...,...,...,...,...,...,...,...,...,...
72620,Republican,What a scary statistic! Americans are working ...,1.540000,en,19.692049,"[0.40658389572308806, -0.037907039832134615, 0...",True,123,0,0
72621,Democrat,I urge @EPAScottPruitt to resign because of hi...,1.286667,en,40.250832,"[0.39530073582068326, 0.1305930108967459, 0.43...",False,143,0,1
72622,Democrat,Awesome! Also hope @Wheeling_Cats will check o...,1.686667,en,16.865290,"[-0.030624291015556082, 0.3155437790628639, 0....",False,140,1,1
72623,Democrat,"RT @CNN: Rep. Adam Schiff, the top Democrat on...",1.540000,en,7.000402,"[0.19569792719232737, 0.05175906507195549, 0.3...",True,140,0,1


#### Encode the data

In [18]:
data_encoded = data_df.copy()
data_encoded["Party"] = le.fit_transform(data_encoded["Party"])
data_encoded["Retweet"] = le.fit_transform(data_encoded["is_retweet"])
data_encoded["language"] = le.fit_transform(data_encoded["language"])
data_encoded

Unnamed: 0,Party,Tweet,sentiment,language,dot_product,vector,is_retweet,length_tweet,num_hashtags,num_mentions,Retweet
0,0,"Today, Senate Dems vote to #SaveTheInternet. P...",1.660000,0,27.835464,"[-0.03105460652295733, 0.3615999109315453, 0.3...",False,139,2,0,0
1,0,RT @WinterHavenSun: Winter Haven resident / Al...,1.420000,0,-6.018233,"[-0.05827691550621239, 0.2105472266111974, -0....",True,140,0,2,1
2,0,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,1.240000,0,11.033403,"[0.2234409597585909, 0.1639013930107467, 0.352...",True,140,0,2,1
3,0,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,1.720000,0,-0.447955,"[-0.24786693104397273, 0.4860032578581013, -0....",True,140,1,3,1
4,0,RT @EmgageActionFL: Thank you to all who came ...,1.733333,0,26.004461,"[0.20108230298446017, 0.02912711470038687, -0....",True,139,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
72620,1,What a scary statistic! Americans are working ...,1.540000,0,19.692049,"[0.40658389572308806, -0.037907039832134615, 0...",True,123,0,0,1
72621,0,I urge @EPAScottPruitt to resign because of hi...,1.286667,0,40.250832,"[0.39530073582068326, 0.1305930108967459, 0.43...",False,143,0,1,0
72622,0,Awesome! Also hope @Wheeling_Cats will check o...,1.686667,0,16.865290,"[-0.030624291015556082, 0.3155437790628639, 0....",False,140,1,1,0
72623,0,"RT @CNN: Rep. Adam Schiff, the top Democrat on...",1.540000,0,7.000402,"[0.19569792719232737, 0.05175906507195549, 0.3...",True,140,0,1,1


#### Split into testing and training data

In [19]:
train_df, test_df = train_test_split(data_encoded, test_size=0.2, random_state=42, shuffle=True)

In [20]:
train_df

Unnamed: 0,Party,Tweet,sentiment,language,dot_product,vector,is_retweet,length_tweet,num_hashtags,num_mentions,Retweet
56059,0,"Bringing on 50,000 new computer science teache...",2.015000,0,12.321572,"[0.23241318799045985, -0.17500857054517382, 0....",True,129,0,0,1
32292,1,Xavier bravely saved another young man's life ...,2.205000,0,8.478309,"[-0.07581671410343915, 0.4409067284748289, 0.0...",False,139,0,0,0
13270,0,@openletterbot @catialada I completely share y...,2.506667,0,5.328323,"[-0.0825056354078697, -0.37020410426703165, -0...",False,140,0,2,0
23351,0,Today is the last day of #OpenEnrollment so do...,1.404000,0,34.265717,"[-0.04632562853088018, 0.32448489299611083, 0....",False,139,1,0,0
10297,0,"In Northwest Oregon, we celebrate the mighty C...",1.385000,0,15.758232,"[-0.6186065866957051, 0.2719376258646662, -0.0...",False,140,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
37194,1,RT @NFIB: Reports of improved earnings reached...,1.765000,0,16.487526,"[-0.13487441587335525, 0.15597572873002719, -0...",True,140,0,2,1
6265,0,"Yesterday I sat down with our Washington, D.C....",1.340000,0,25.976894,"[-0.017384540291232042, 0.07391470867275984, 0...",False,140,0,0,0
54886,0,"The week after 9/11, George W. Bush went to a ...",1.410000,0,34.536224,"[-0.01194908512091164, -0.041027962788463324, ...",False,136,0,0,0
860,0,Good morning! Join me at 7:40am when Rep. Pete...,1.664000,0,18.480972,"[0.02011150051629329, 0.5170896550322936, 0.26...",False,140,0,0,0


In [21]:
test_df

Unnamed: 0,Party,Tweet,sentiment,language,dot_product,vector,is_retweet,length_tweet,num_hashtags,num_mentions,Retweet
61018,1,RT @93wibc: @RepJimBanks discusses the Preside...,1.340000,0,3.502597,"[0.4720800658396911, 0.2966051958537719, -0.26...",True,140,0,3,1
20416,0,RT @HomelandDems: ICYMI - Ranking Members @Ben...,1.030000,0,1.593524,"[0.21594631337211467, 0.08835732201441715, -0....",True,144,0,3,1
43595,1,Congressman Griffith enjoyed watching last nig...,2.073333,0,2.393675,"[-0.12946978604304604, -0.11951498006237671, 0...",False,135,0,0,0
26231,0,President Trump is pushing our country toward ...,2.400000,0,19.899752,"[0.07013527980689105, 0.27208977884129126, 0.2...",False,140,0,0,0
71634,0,Seeing the massive crowds excited to hear Mich...,0.940000,0,15.474901,"[0.06325251617045069, -0.1680865640846605, 0.2...",False,143,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1281,0,#HappyMothersDay to all the moms and honorary ...,1.995000,0,40.250832,"[0.09649022076985148, 0.31389840979989003, 0.0...",False,139,1,0,0
45443,1,UPDATE: The hearing will reconvene at the end ...,0.886667,0,13.317113,"[0.31388538908777264, 0.029376562393736094, 0....",False,73,0,0,0
65141,0,Nurses are at our sides through some of the be...,2.120000,0,11.858605,"[-0.12075630656568137, 0.25943277220335403, 0....",False,140,1,0,0
33239,1,"St. Nickolas, Constantine, &amp; Helen Greek O...",1.073333,0,-3.717728,"[-0.6164123144908444, 0.3801139269600071, 0.03...",False,144,1,0,0


#### Get features of testing and training set

In [22]:
# training set
x_train = train_df[["sentiment", "language", "dot_product", "is_retweet"]]
y_train = train_df[["Party"]]

# testing set
x_test = test_df[["sentiment", "language", "dot_product", "is_retweet"]]
y_test = test_df[["Party"]]

#### Build ID3 tree using entropy

In [26]:
clf_entropy = tree.DecisionTreeClassifier(criterion="entropy")
clf_entropy.fit(x_train, y_train)

DecisionTreeClassifier(criterion='entropy')

#### Display Tree

In [30]:
dot_data_entropy = tree.export_graphviz(clf_entropy, out_file=None,
                                feature_names=["sentiment", "language", "dot_product", "is_retweet"],
                                class_names=["Democrat", "Republican"],
                                filled=True, rounded=True,
                                special_characters=True)  
graph_entropy = graphviz.Source(dot_data_entropy)
graph_entropy

KeyboardInterrupt: 

In [27]:
# Print Accuracy
y_pred_entropy = clf_entropy.predict(x_test)
print("ID3 Tree Results")
print("Predicted:", y_pred_entropy)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_entropy))

ID3 Tree Results
Predicted: [0 0 1 ... 1 1 0]
Accuracy: 0.5275043029259897


#### Build tree using Gini

In [28]:
clf_gini = tree.DecisionTreeClassifier()
clf_gini.fit(x_train, y_train)

DecisionTreeClassifier()

#### Display Gini Tree

In [None]:
dot_data_gini = tree.export_graphviz(clf_gini, out_file=None,
                                feature_names=["sentiment", "language", "dot_product", "is_retweet"],
                                class_names=["Democrat", "Republican"],
                                filled=True, rounded=True,
                                special_characters=True)  
graph_gini = graphviz.Source(dot_data_gini)


In [29]:
# Print Accuracy
y_pred_gini = clf_gini.predict(x_test)
print("Gini Tree Results")
print("Predicted:", y_pred_gini)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_gini))

Gini Tree Results
Predicted: [0 0 1 ... 1 1 0]
Accuracy: 0.5344578313253012


In [32]:
# Random Forest
clf_rf = RandomForestClassifier(max_depth=7, min_samples_split=0.1, min_samples_leaf=0.05)
clf_rf.fit(x_train, y_train.values.ravel())
y_pred_rf = clf_rf.predict(x_test)
print("Predicted:", y_pred_rf)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_rf))

Predicted: [1 0 1 ... 1 0 1]
Accuracy: 0.5224784853700516
