In [1]:
## Packages need for data pre-process
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

from scipy import sparse
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

import itertools

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [16]:
# Read the tweet CSV and drop the 'Unnamed: 0' column in one expression
# (presumably a leftover saved index -- verify against the file).
df = pd.read_csv('Twitter_mani.csv').drop(columns=['Unnamed: 0'])

In [17]:
# Lower-case the 'text' column, then materialise it as a plain Python list
tweets_lower = df['text'].str.lower()
data = tweets_lower.values.tolist()

In [18]:
# Define basic pre-process function
def preProcessingFcn(tweet, removeAt=True, removeHashtags=True, removeNewline=True, removeURL=True,
    removeNumbers=True):
    """Clean a single tweet string before tokenisation.

    Two substitutions always run: a literal backslash followed by ``n``
    (a newline stored as escaped text) and the HTML entity ``&amp`` -- with
    or without its trailing ``;`` -- are each replaced by one space.  The
    optional steps below then run in parameter order.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    removeAt : bool, default True
        Delete every ``@`` character (the handle text itself is kept).
    removeHashtags : bool, default True
        Delete every ``#`` character (the tag text itself is kept).
    removeNewline : bool, default True
        Collapse each run of whitespace, real newlines included, into a
        single space.
    removeURL : bool, default True
        Delete every ``http...`` token up to the next whitespace.
    removeNumbers : bool, default True
        Delete every character for which ``str.isdigit()`` is true.

    Returns
    -------
    str
        The cleaned tweet.
    """
    tweet = re.sub(r"\\n", " ", tweet)
    # Consume the optional ';' too, so "&amp;" does not leave a stray ';'.
    tweet = re.sub(r"&amp;?", " ", tweet)

    if removeAt:
        tweet = re.sub(r"@", "", tweet)
    if removeHashtags:
        tweet = re.sub(r"#", "", tweet)
    if removeNewline:
        # Raw string: "\s" in a normal literal is an invalid escape sequence.
        tweet = re.sub(r"\s+", " ", tweet)
    if removeURL:
        # Runs after whitespace collapsing, so a removed URL can leave two
        # adjacent spaces; the later tokenisation step is insensitive to that.
        tweet = re.sub(r"http\S+", "", tweet)
    if removeNumbers:
        tweet = ''.join(ch for ch in tweet if not ch.isdigit())

    return tweet

In [19]:
# Run every lower-cased tweet through the cleaning function (default options)
data = list(map(preProcessingFcn, data))

In [20]:
# Tokenise each document with gensim's simple_preprocess()
def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Each item is coerced with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``, which per the
    gensim documentation strips accent marks from the tokens.
    """
    for document in sentences:
        tokens = gensim.utils.simple_preprocess(str(document), deacc=True)
        yield tokens

data_words = list(sent_to_words(data))

# Remove Stop Words
stop_words = stopwords.words('english')
# Set copy for O(1) membership tests; the list keeps its original name.
stop_words_set = set(stop_words)

# data_words already holds token lists, so filter them directly instead of
# converting each list to its string repr and running simple_preprocess()
# on it a second time.
data_words_unigrams = [[word for word in doc if word not in stop_words_set] for doc in data_words]

# Option A: Stemming (run this section if stemming is used)

In [8]:
# Re-join every token list into one space-separated string per tweet
data = [' '.join(tokens) for tokens in data_words_unigrams]
    
def stemming(tweet, stem=True):
    """Apply NLTK's PorterStemmer to every whitespace-separated word.

    Parameters
    ----------
    tweet : str
        Text whose words are separated by whitespace.
    stem : bool, default True
        When falsy, ``tweet`` is returned untouched and no stemmer is built.

    Returns
    -------
    str
        The stemmed words joined by single spaces, or the original ``tweet``
        when ``stem`` is falsy.
    """
    if not stem:
        # Nothing to do: skip constructing a stemmer altogether.
        return tweet

    ps = PorterStemmer()
    return ' '.join(ps.stem(word) for word in tweet.split())

# Stem each tweet string, then split it straight back into a token list
data_stemming1 = [stemming(tweet).split() for tweet in data]

data_stemming = data_stemming1

In [9]:
# Count total and unique words
merged = list(itertools.chain.from_iterable(data_stemming))
print(len(merged))
print(len(set(merged)))

# Identify words that appear at least MIN_WORD_COUNT times
MIN_WORD_COUNT = 20
c = Counter(merged)
a = [word for word, count in c.items() if count >= MIN_WORD_COUNT]
# Set copy of `a`: O(1) membership instead of scanning the list per token.
frequent_words = set(a)

# Keep only the frequent words in every document (data_stemming is updated in place)
for doc_idx, tokens in enumerate(data_stemming):
    data_stemming[doc_idx] = [token for token in tokens if token in frequent_words]

# Check
merged = list(itertools.chain.from_iterable(data_stemming))
print(len(set(merged)))

# Positions of the documents that have no tokens left after filtering
empty_idx = [doc_idx for doc_idx, tokens in enumerate(data_stemming) if not any(tokens)]

# Delete empty elements
data_stemming2 = list(filter(None, data_stemming))
print(len(data_stemming2))

141438
10242
1321
12439


In [10]:
# Documents used for both the vocabulary and the corpus
texts = data_stemming2

# Vocabulary built by gensim from those documents
id2word = corpora.Dictionary(texts)

# One doc2bow() result per document
corpus = list(map(id2word.doc2bow, texts))

In [11]:
# Densify the bag-of-words corpus. The vocabulary size is read from the
# dictionary instead of being hard-coded, so it stays correct when the data
# or the frequency threshold changes.
a_s = gensim.matutils.corpus2dense(corpus, num_terms=len(id2word))

# Create Doc-word matrix: transpose so that rows are documents and columns are terms
b_s = a_s.T.astype(np.float64)
print(b_s.shape)
print(b_s)

(12439, 1321)
[[1. 1. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [12]:
# Extract Document index: labels of the df rows whose documents survived filtering.
# empty_idx holds list positions, so this relies on df carrying the default
# 0..n-1 index produced by read_csv.
empty_lookup = set(empty_idx)  # O(1) membership instead of a list scan per row
selected_idex = [x for x in list(df.index) if x not in empty_lookup]

# Obtain remaining terms, ordered by dictionary id 0..len-1 as in the original loop
words = [id2word[term_id] for term_id in range(len(id2word))]

# Create a dataframe
b_ss = pd.DataFrame(b_s, columns=words, index=selected_idex)

In [148]:
#np.savetxt('doc_word_matrix_stemmingf.csv', b_ss, delimiter=',')
#b_ss.to_csv("doc_word_matrix_stemmingf.csv")

# Option B: Lemmatization (run this section if lemmatization is used)

In [14]:
# Define functions for lemmatization
def lemmatization(texts):
    """Lemmatize tokenised documents with the module-level spaCy pipeline ``nlp``.

    Each token list is joined with single spaces, passed through ``nlp``, and
    replaced by the ``lemma_`` of every token in the resulting doc.
    Reference: https://spacy.io/api/annotation
    """
    return [
        [token.lemma_ for token in nlp(" ".join(words))]
        for words in texts
    ]

# Load the spaCy pipeline; lemmatization() looks up `nlp` when it is called
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

# Lemmatize the stop-word-filtered token lists
data_lemmatized = lemmatization(data_words_unigrams)

In [15]:
# Count total and unique words
merged = list(itertools.chain.from_iterable(data_lemmatized))
print(len(merged))
print(len(set(merged)))

# Identify words that appear at least MIN_WORD_COUNT times
MIN_WORD_COUNT = 20
c = Counter(merged)
a = [word for word, count in c.items() if count >= MIN_WORD_COUNT]
# Set copy of `a`: O(1) membership instead of scanning the list per token.
frequent_words = set(a)

# Keep only the frequent words in every document (data_lemmatized is updated in place)
for doc_idx, tokens in enumerate(data_lemmatized):
    data_lemmatized[doc_idx] = [token for token in tokens if token in frequent_words]

# Check
merged = list(itertools.chain.from_iterable(data_lemmatized))
print(len(set(merged)))

# Positions of the documents that have no tokens left after filtering
empty_idx = [doc_idx for doc_idx, tokens in enumerate(data_lemmatized) if not any(tokens)]

# Delete empty elements
data_lemmatized2 = list(filter(None, data_lemmatized))
print(len(data_lemmatized2))

141670
11762
1289
12423


In [21]:
# Documents used for both the vocabulary and the corpus
texts = data_lemmatized2

# Vocabulary built by gensim from those documents
id2word = corpora.Dictionary(texts)

# One doc2bow() result per document
corpus = list(map(id2word.doc2bow, texts))

In [22]:
# Densify the bag-of-words corpus. The vocabulary size is read from the
# dictionary instead of being hard-coded, so it stays correct when the data
# or the frequency threshold changes.
a_l = gensim.matutils.corpus2dense(corpus, num_terms=len(id2word))

# Create Doc-word matrix: transpose so that rows are documents and columns are terms
b_l = a_l.T.astype(np.float64)
print(b_l.shape)
print(b_l)

(12423, 1289)
[[1. 1. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [23]:
# Extract Document index: labels of the df rows whose documents survived filtering.
# empty_idx holds list positions, so this relies on df carrying the default
# 0..n-1 index produced by read_csv.
empty_lookup = set(empty_idx)  # O(1) membership instead of a list scan per row
selected_idex_l = [x for x in list(df.index) if x not in empty_lookup]

# Obtain remaining terms, ordered by dictionary id 0..len-1 as in the original loop
words_l = [id2word[term_id] for term_id in range(len(id2word))]

# Create a dataframe
b_ll = pd.DataFrame(b_l, columns=words_l, index=selected_idex_l)

In [151]:
#np.savetxt('doc_word_matrix_lemmatizedf.csv', b_ll, delimiter=',')
#b_ll.to_csv("doc_word_matrix_lemmatizedf.csv")