In [None]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import os
from contractions import contractions_dict
import contractions
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from spacy.lang.en import STOP_WORDS as spacy_stopwords
import spacy
from tqdm import tqdm
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from collections import Counter
from scipy.sparse.linalg import svds

In [2]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [3]:
nlp = spacy.load("en_core_web_md",disable=["ner","parser"])

In [4]:
combined_stopwords = set(stopwords.words('english')).union(set(spacy_stopwords))

In [None]:
data = pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv",
                  encoding="latin-1",header=None,names=["sentiment","id","date","flag","username","text"])

In [None]:
data.head()

In [None]:
data.drop(labels=data.columns[1:5],axis=1,inplace=True)

In [None]:
data.head()

# We Will Do 7 step filteration 


### Normalization


In [None]:
def normalize_tweet(tweet):

    return tweet.lower()

In [None]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    data["text"] = list(pool.map(normalize_tweet,data["text"]))

### Contraction Fixing

In [None]:
def fix_contractions(tweet):

    return contractions.fix(tweet)

In [None]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    data["text"] = list(pool.map(fix_contractions,data["text"]))

### Useless Tokens Removal

In [None]:
def remove_noisy_tokens(tweet):

    return re.sub(pattern=r'@[a-zA-Z0-9 ]+|#[a-zA-Z0-9 ]+|\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*|\W+|\d+|<("[^"]*"|\'[^\']*\'|[^\'">])*>|_+|[^\u0000-\u007f]+',
                 string=tweet,repl=" ")

In [None]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    data["text"] = list(pool.map(remove_noisy_tokens,data["text"]))

### Leftover Useless Tokens Removal

In [None]:
def remove_remaining_noisy_tokens(tweet):

    return re.sub(pattern=r'\b\w\b|[^\u0000-\u007f]+|_+|\W+',
                 string=tweet,repl=" ")

In [None]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    data["text"] = list(pool.map(remove_remaining_noisy_tokens,data["text"]))

### Tokenizing The Tokens

In [None]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    data["text"] = list(pool.map(word_tokenize,data["text"]))

### Stopwards Removal

In [None]:
def is_stopword(token):

    return token not in combined_stopwords

In [None]:
def remove_stopwords(tokenized_tweet):

    return [token for token in tokenized_tweet if is_stopword(token)]

In [None]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    data["text"] = list(pool.map(remove_stopwords,data["text"]))

In [None]:
with open("stopwords_removed.pkl","wb") as file_handle:
    pickle.dump(data["text"],file_handle)

### Lemmatization

In [None]:
def lemmatize_tweet(tokenized_tweet):

    raw_tweet = " ".join(tokenized_tweet)
    doc = nlp(raw_tweet)
    lemmatized_tweet = list()

    for token in doc:
        lemmatized_tweet.append(token.lemma_)

    return lemmatized_tweet

In [None]:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    data["text"] = list(tqdm(pool.map(lemmatize_tweet,data["text"])))

### Saving the result in .pkl file 

In [None]:
with open("lemmatized_tweets.pkl","wb") as file_handle:
    pickle.dump(data["text"],file_handle)

In [5]:
data = pd.DataFrame()

with open("lemmatized_tweets.pkl","rb") as file_handle:
    data["text"] = pickle.load(file_handle)

In [24]:
data.head()

Unnamed: 0,text
0,"[twitpic, com, zl, awww, bummer, shoulda, get,..."
1,"[upset, update, facebook, texte, cry, result, ..."
2,"[manage, save, rest, bound]"
3,"[body, feel, itchy, like, fire]"
4,"[behave, mad]"


In [6]:
converted_raw_text = list(data["text"].apply(lambda x: " ".join(x)))

In [7]:
converted_raw_text = list(filter(lambda x: len(x) > 0,converted_raw_text))

In [8]:
len(converted_raw_text)

1408026

In [9]:
vocab = set()

for cleaned_tweet in converted_raw_text:
    vocab.update(set(cleaned_tweet.split(" ")))

In [10]:
len(vocab)

273488

In [11]:
vectorizer = TfidfVectorizer()
vectorized_text = vectorizer.fit_transform(converted_raw_text)

In [12]:
vectorized_text.shape

(1408026, 273469)

In [13]:
cumulative_tfs = Counter()
for cleaned_tweet in data["text"]:

    cumulative_tfs.update(cleaned_tweet)

In [14]:
cumulative_tfs

Counter({'go': 102402,
         'day': 99448,
         'get': 88925,
         'good': 82767,
         'work': 76169,
         'quot': 72094,
         'like': 68153,
         'want': 64104,
         'love': 63841,
         'today': 62896,
         'time': 57589,
         'amp': 48695,
         'miss': 48293,
         'think': 46344,
         'know': 44912,
         'feel': 44099,
         'lol': 41503,
         'night': 40964,
         'watch': 38933,
         'need': 38403,
         'new': 38112,
         'home': 37841,
         'come': 36888,
         'look': 33425,
         'tomorrow': 32953,
         'sleep': 32023,
         'hope': 31419,
         'twitter': 30627,
         'morning': 29774,
         'thank': 29229,
         'wait': 28248,
         'bad': 28089,
         'great': 27781,
         'wish': 27496,
         'sad': 25608,
         'way': 25495,
         'week': 25139,
         'oh': 24901,
         'fun': 24094,
         'tonight': 23524,
         'happy': 23272,
       

In [15]:
most_frequent_tokens = cumulative_tfs.most_common(30000)
most_frequent_tokens = dict(most_frequent_tokens)
truncated_vocab = list(most_frequent_tokens.keys())

truncated_vocab2idx = dict(zip(truncated_vocab,range(len(truncated_vocab))))

In [17]:
vectorizer = TfidfVectorizer(vocabulary=truncated_vocab2idx)
vectorized_text = vectorizer.fit_transform(converted_raw_text)



In [22]:
vectorized_text.shape

(1408026, 30000)

In [28]:
Q, S, QT = svds(vectorized_text)

In [None]:
Q,S,QT

In [33]:
print("Q shape:",Q.shape)
print("S shape:",S.shape)
print("QT shape:",QT.shape)

Q shape: (1408026, 6)
S shape: (6,)
QT shape: (6, 30000)
