In [121]:
import re, string
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
import pandas as pd

from pprint import pprint

In [94]:
#we will clean text after reading it in
def clean(line, punctuation, stemmer):
    #remove punctuation
    line = str(line)
    line2 = line.translate(str.maketrans('','', punctuation))

    #stem words
    line3 = [stemmer.stem(word) for word in line2.split()]

    return " ".join(line3)

In [3]:
punctuation = re.sub('[$%<=>]', "", string.punctuation)
stemmer = PorterStemmer()

#read strings from dataset file
filepath = 'financial-news-dataset-master/dataset.txt'

with open(filepath, 'r') as f:
    corpus = [clean(line, punctuation, stemmer) for line in f.readlines()]

In [86]:
reddit_stats = pd.read_excel('statistics_reddit_data.xlsx')

In [5]:
train = pd.read_csv('Sent140data/train.csv',  encoding = "ISO-8859-1", header = None)
test = pd.read_csv('Sent140data/test.csv',  encoding = "ISO-8859-1", header = None)

In [6]:
sent = pd.concat([train, test])

In [8]:
tweets = list(sent[5])

In [97]:
clean_reddit = []

for post in reddit_stats['comment_body']:
    clean_reddit.append([clean(post, punctuation, stemmer), 1])

In [9]:
clean_tweets = []

for tweet in tweets:
    clean_tweets.append(clean(tweet, punctuation, stemmer))

In [11]:
tweets_class = []

for tweet in clean_tweets:
    tweets_class.append([tweet, 0])

In [10]:
len(clean_tweets)

1600498

In [13]:
corpus_class = []

for doc in corpus:
    corpus_class.append([doc, 1])

In [98]:
df= tweets_class + corpus_class + clean_reddit

In [99]:
#make a df with clean_tweets and reuters data
DF = pd.DataFrame(df, columns = ['text','class'])

In [100]:
y = DF['class']

In [101]:
vectorizer = CountVectorizer(max_features = 250,
                             stop_words = 'english')
X = vectorizer.fit_transform(DF['text'])

In [102]:
#for feature extraction
sorted_vocab = sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1])

In [103]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=420)

In [111]:
clf = RandomForestClassifier(n_estimators = 250,
                             max_features= 100, 
                             max_depth=25, n_jobs=-1, random_state=420)

In [112]:
 clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features=100, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
            oob_score=False, random_state=420, verbose=0, warm_start=False)

In [114]:
y_predict = clf.predict(X_test)

In [117]:
#model oos prediction
accuracy_score(y_test, y_predict)
precision_score(y_test, y_predict, average='macro')

0.9926130411261825

In [120]:
#get feature importances, sorted
pprint(sorted(zip(map(lambda x: round(x, 4), clf.feature_importances_), sorted_vocab), reverse=True))

[(0.367, ('report', 185)),
 (0.2395, ('said', 195)),
 (0.1457, ('edit', 63)),
 (0.0671, ('percent', 166)),
 (0.033, ('market', 142)),
 (0.0308, ('data', 46)),
 (0.0165, ('compani', 34)),
 (0.0158, ('year', 247)),
 (0.0124, ('billion', 19)),
 (0.0077, ('use', 233)),
 (0.0076, ('thi', 218)),
 (0.0052, ('ha', 98)),
 (0.0038, ('valu', 234)),
 (0.0035, ('financi', 81)),
 (0.0029, ('estim', 65)),
 (0.002, ('gener', 90)),
 (0.0019, ('bank', 13)),
 (0.0018, ('point', 169)),
 (0.0014, ('share', 204)),
 (0.0014, ('result', 186)),
 (0.0013, ('peopl', 165)),
 (0.0012, ('becaus', 14)),
 (0.0011, ('reuter', 187)),
 (0.0009, ('problem', 175)),
 (0.0008, ('wa', 236)),
 (0.0008, ('level', 127)),
 (0.0008, ('industri', 117)),
 (0.0007, ('think', 220)),
 (0.0007, ('say', 197)),
 (0.0007, ('new', 152)),
 (0.0007, ('million', 145)),
 (0.0007, ('mean', 143)),
 (0.0007, ('expect', 71)),
 (0.0007, ('ani', 8)),
 (0.0007, ('addit', 3)),
 (0.0006, ('make', 139)),
 (0.0006, ('like', 128)),
 (0.0006, ('case', 25))