In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import gensim
from gensim.similarities import WmdSimilarity
import time

In [2]:
def import_data(data):
    df = pd.read_csv(data, index_col=0)
    df = pd.DataFrame(df['text'])
    df = df.drop_duplicates()
    df = df.reset_index().drop(columns=['index'])
    return df

In [3]:
data = "tweets.csv"
df = import_data(data)
df.head()

Unnamed: 0,text
0,Here is my statement.pic.twitter.com/WAZiGoQqMQ
1,Is this really America? Terrible!pic.twitter.c...
2,The media and establishment want me out of the...
3,Certainly has been an interesting 24 hours!
4,Debate polls look great - thank you!\n#MAGA #A...


# Pre-Process the data

In [4]:
def preprocess(text, stem=False):
    TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    stop_words = stopwords.words("english")
    stemmer = SnowballStemmer("english")
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [5]:
def split_doc(df_clean):
    documents = [text.split() for text in df_clean]
    return documents

In [6]:
def model(documents):
    w2v_model = gensim.models.word2vec.Word2Vec(size=300, window=7, min_count=10, workers=8)
    w2v_model.build_vocab(documents)
    w2v_model.train(documents, total_examples=len(documents), epochs=35)
    return w2v_model

In [7]:
def similarity(df_clean, w2v_model):
    instance = WmdSimilarity(df_clean, w2v_model, num_best=20)
    return instance

In [8]:
def top_tweets(nb_top, text):
    clean_text = preprocess(text)
    similarities = instance[clean_text]
    
    rows, cols = (nb_top, 2) 
    results = [[0 for i in range(cols)] for j in range(rows)]

    print('Your text:')
    print(text)
    for i in range(nb_top):
        print('\n\nsimilarities = %.4f' % similarities[i][1])
        print(df.text[similarities[i][0]])
        results[i][0] = df.text[similarities[i][0]]
        results[i][1] = similarities[i][1]
    
    return results

In [11]:
df_clean = df.text.apply(lambda x: preprocess(x))
documents = split_doc(df_clean)
w2v_model = model(documents)
instance = similarity(df_clean, w2v_model)

nb_top = 20
text = "@Scott__Marx  Can't wait to hear @realDonaldTrump run for president and fix our country! #TRUMP2016"
results = top_tweets(nb_top, text)

Your text:
@Scott__Marx  Can't wait to hear @realDonaldTrump run for president and fix our country! #TRUMP2016


similarities = 0.9494
"@Scott__Marx  Can't wait to hear @realDonaldTrump run for president and fix our country! #TRUMP2016"


similarities = 0.8900
"@tristanmf: @realDonaldTrump Trump for president ! Trump for America ! #Trump2016"


similarities = 0.8836
"@zrill: @realDonaldTrump just confirm your intention and lets put the matter to rest, will you run? #Trump2016"  Watch.


similarities = 0.8820
"@RealSQUEZZ: @Realdonaldtrump IS A WINNER, THAT'S WHY HE WILL RUN FOR PRESIDENCY!!!!!!! #Trump2016 FOR SURE!!!!"


similarities = 0.8613
"@NattieBright: @realDonaldTrump If u r so disappointed at Obama why don't u run for president in 2016!"  Watch!


similarities = 0.8582
"@rramz1979: @realDonaldTrump Why don't you run for US president?" Watch.


similarities = 0.8536
"@barrman: @realDonaldTrump if you run for president in 2016 you would have my vote !"


similarities = 0.8479
"@

In [12]:
#import pickle
#pickle.dump(instance, open("instance3.pkl", "xb"), protocol=3)