In [1]:
#Import libraries
import nltk
import re
import pandas as pd
import pickle

from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
#nltk.download(['twitter_samples', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'])

#Uncomment the statement above when running on a new machine (it downloads the NLTK data this notebook needs).

In [3]:
# Read the tokenized positive and negative tweets from NLTK's twitter_samples corpus.
positive_tokens, negative_tokens = (
    twitter_samples.tokenized(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)


In [4]:
#Custom functions
# Patterns removed, in this order, from every lower-cased token.
_TOKEN_NOISE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'@[a-z0-9_]\S+',    # @mentions
        r'#[a-z0-9_]\S+',    # hashtags
        r'&[a-z0-9_]\S+',    # '&'-prefixed runs, e.g. the HTML entity "&amp;"
        r'[?!.+,;$£%&"]+',   # punctuation and currency symbols
        r'\d+',              # digits
        r'https?:?\/\/\S+',  # URLs
    )
)

# First two characters of the tag returned by pos_tag -> WordNet part of speech.
# Any other tag falls back to noun ('n').
_WORDNET_POS_BY_TAG_PREFIX = {'NN': 'n', 'VB': 'v', 'RB': 'r', 'JJ': 'a'}


def clean_tokens(tokens):
    """Normalise one tokenized tweet into a list of cleaned lemmas.

    Each token is lower-cased, stripped of mentions, hashtags, '&'-prefixed
    runs, punctuation, digits and URLs, and then lemmatized with the WordNet
    part of speech derived from its POS tag. Tokens that end up empty, the
    standalone retweet marker ``rt``, a lone ``:`` and English stop words are
    dropped.

    Args:
        tokens: the tokens of a single tweet (passed as-is to ``pos_tag``).

    Returns:
        list[str]: the surviving lemmas, in their original order.
    """
    lemmatizer = WordNetLemmatizer()
    # A set gives constant-time membership tests for the stop-word filter.
    stop_words = set(stopwords.words('english'))
    clean_list = []

    for token, tag in pos_tag(tokens):
        token = token.lower()
        for noise in _TOKEN_NOISE_PATTERNS:
            token = noise.sub('', token)

        # Only a token that *is* the retweet marker is discarded; removing
        # "rt" wherever it occurs would corrupt words such as "party" -> "pay".
        if not token or token == 'rt':
            continue

        position = _WORDNET_POS_BY_TAG_PREFIX.get(tag[:2], 'n')
        lemma = lemmatizer.lemmatize(token, pos=position)

        # Filter each lemma once, instead of re-filtering the whole list on
        # every loop iteration.
        if lemma and lemma != ':' and lemma not in stop_words:
            clean_list.append(lemma)

    return clean_list

def data_prepare(tokens, status):
    """Pair every tweet in ``tokens`` with the label ``status``.

    Returns a list of ``(tweet, status)`` tuples in the input order.
    """
    labelled = []
    for tweet in tokens:
        labelled.append((tweet, status))
    return labelled

def featureset_prepare():
    """Clean both tweet corpora and return parallel feature/label lists.

    The module-level ``positive_tokens`` are cleaned and labelled
    'Positive', then ``negative_tokens`` are cleaned and labelled 'Negative'.

    Returns:
        tuple: ``(features, labels)`` where ``features[i]`` is the cleaned
        token list of a tweet and ``labels[i]`` its label; all positive
        tweets come before the negative ones.
    """
    labelled_tweets = []
    for corpus, sentiment in ((positive_tokens, 'Positive'), (negative_tokens, 'Negative')):
        cleaned = [clean_tokens(tweet) for tweet in corpus]
        labelled_tweets.extend(data_prepare(cleaned, sentiment))

    features = [pair[0] for pair in labelled_tweets]
    labels = [pair[1] for pair in labelled_tweets]
    return features, labels

def train_model(random_state=None):
    """Train a TF-IDF + random-forest sentiment classifier and pickle both.

    The cleaned corpus from ``featureset_prepare()`` is vectorized with
    uni- and bi-gram TF-IDF features (at most 3000), 85% of it is used to
    fit a 200-tree random forest, and the fitted classifier and vectorizer
    are written to the files 'rf_classifier' and 'rf_vectorizer' in the
    current working directory.

    Args:
        random_state: optional seed forwarded to both ``train_test_split``
            and ``RandomForestClassifier`` so a run can be reproduced. The
            default ``None`` leaves both unseeded.

    Returns:
        None. Prints a confirmation message once both files are written.
    """
    # Build the cleaned corpus once: it POS-tags and lemmatizes every tweet,
    # so calling featureset_prepare() separately for features and labels
    # would repeat all of that work.
    features, labels = featureset_prepare()

    # The features are already token lists, so the "preprocessor" simply joins
    # them back into a string for the vectorizer's own tokenizer.
    rf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_features=3000, preprocessor=' '.join)
    vectorized_features = rf_vectorizer.fit_transform(features)

    # The 15% held-out part is not used in this function, hence the throwaway names.
    X_train, _, y_train, _ = train_test_split(
        vectorized_features, labels, test_size=0.15, shuffle=True, random_state=random_state
    )
    rf_classifier = RandomForestClassifier(n_estimators=200, random_state=random_state)
    rf_classifier = rf_classifier.fit(X_train, y_train)

    # Context managers guarantee the pickle files are flushed and closed.
    with open('rf_classifier', 'wb') as classifier_file:
        pickle.dump(rf_classifier, classifier_file)
    with open('rf_vectorizer', 'wb') as vectorizer_file:
        pickle.dump(rf_vectorizer, vectorizer_file)
    print('Successfully saved classifier and vectorizer')
    

In [5]:
#Execute: train the random-forest classifier and pickle it together with its vectorizer
train_model()

Successfully saved classifier and vectorizer
