In [None]:
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords')
nltk.download('punkt')

In [14]:
# Load the raw CSV. header=None means the first line is treated as data and
# pandas labels the columns with integers (0, 1, 2, ...).
data = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    header=None,
    encoding="latin-1",
)

# Preprocessing

In [210]:
def _stop_word_set():
    """Return the NLTK stop-word list as a frozenset, building it on first use.

    Reading the corpus is expensive, so the result is cached on the function
    object and reused for every later call in the same process.
    """
    cached = getattr(_stop_word_set, "_cache", None)
    if cached is None:
        # NOTE(review): words() is called without a language, so this is the
        # combined list for every language in the corpus -- confirm that is
        # intended for this dataset.
        cached = frozenset(stopwords.words())
        _stop_word_set._cache = cached
    return cached


def clean_sentence(text):
    """Normalise one piece of text into a list of stemmed word tokens.

    Steps, in order: strip URLs, strip @mentions, strip ASCII punctuation,
    tokenize, keep purely alphabetic tokens, lowercase, drop stop words,
    and stem what is left.

    Args:
        text: The raw string to clean.

    Returns:
        A list of stemmed, lowercase tokens (possibly empty).

    Relies on a module-level ``stemmer`` exposing ``.stem(word)``; it must be
    defined before this function is called.
    """
    # Remove links
    text = re.sub(r"http\S+", "", text)
    # Remove @mentions (raw string avoids the invalid "\w" escape warning)
    text = re.sub(r"@\w*", "", text)
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    stop_words = _stop_word_set()
    # Lowercase *before* the stop-word test: the stop-word list is lowercase,
    # so capitalised stop words such as "The" would otherwise slip through.
    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]

    # The tokens are already alphabetic words, so they are stemmed directly
    # instead of being joined and tokenized a second time.
    return [stemmer.stem(token) for token in tokens if token not in stop_words]

In [198]:
from pandarallel import pandarallel

# clean_sentence looks up this module-level stemmer, so create it before the
# apply below runs.
stemmer = PorterStemmer()

# Register pandarallel's parallel_* methods on pandas objects.
pandarallel.initialize()

# Run clean_sentence over column 5 in parallel and keep the result as column 6.
data[6] = data[5].parallel_apply(clean_sentence)

# Persist the augmented frame as a tab-separated file.
data.to_csv("tweets.csv", sep="\t", encoding="latin-1")

INFO: Pandarallel will run on 72 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 299 ms, sys: 1.65 s, total: 1.95 s
Wall time: 21.9 s


# Feature Extraction

In [201]:
# Reload the preprocessed tweets. The file was written by DataFrame.to_csv with
# its default header and index, so the first line holds column labels and the
# first field of every line is the saved index. skiprows=1 drops that label
# line so it is not read as a data record, while header=None keeps the
# positional integer column names (0 = saved index, 1..7 = original columns
# 0..6) that the later cells rely on.
data = pd.read_csv(
    "tweets.csv",
    sep='\t',
    header=None,
    skiprows=1,
    encoding='latin-1',
)

###  Tf-idf Vectors

In [216]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features built from unigrams and bigrams (ngram_range is inclusive).
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
)

In [239]:
# Fit the vectorizer on the column at position 7 and transform it in one pass.
# NOTE(review): position 7 is presumably the cleaned-text column -- confirm.
x_tfidf = vectorizer.fit_transform(
    data.iloc[:, 7]
)

In [252]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#Model Selection and Validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

from sklearn.ensemble import RandomForestClassifier

In [265]:
# Text-classification pipeline: raw strings -> token counts -> TF-IDF -> classifier.
pipeline = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    # Train a random forest on the TF-IDF vectors. random_state is fixed so
    # that re-running the notebook reproduces the same model.
    ('classifier', RandomForestClassifier(max_depth=20, n_jobs=-1, verbose=1, random_state=42)),
])

In [None]:
# Hold out 20% of the rows for evaluation; column 7 is the model input and
# column 1 the target. random_state is fixed so the split is reproducible.
msg_train, msg_test, label_train, label_test = train_test_split(
    data[7], data[1], test_size=0.2, random_state=42
)
pipeline.fit(msg_train, label_train)
predictions = pipeline.predict(msg_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall in the report and transposes the confusion matrix.
print(classification_report(label_test, predictions))
print(confusion_matrix(label_test, predictions))
print(accuracy_score(label_test, predictions))