## Imports and prerequisites

In [12]:
# NLP functions
from nlpfunctions import clean_text, spacy_lemmatizer, remove_nonenglish

# Basic imports
import re
import string
import pandas as pd
import datetime as dt

# NLP packages
import spacy
import nltk

# Stopwords
from sklearn.feature_extraction import _stop_words

# Display
from tqdm import tqdm
# Turn off pandas' chained-assignment warning and show text columns untruncated
pd.set_option('mode.chained_assignment', None)
pd.options.display.max_colwidth = None

# Enable `progress_apply` on pandas objects, drawn as a green bar at position 0
tqdm.pandas(position=0, colour='green')

In [13]:
# Load the spaCy model by its installed package name rather than an absolute,
# machine-specific path, so the notebook runs on any machine where the model is installed.
# The original path pointed at en_core_web_lg 3.4.0; install that version to reproduce results.
# The parser and NER components are disabled because they are excluded from the pipeline here.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

In [14]:
# Adjustments to scikit-learn's English stop-word list:
# words in `remove_stop` are taken off the list, words in `add_stop` are appended to it
remove_stop = ['not', 'without', 'would', 'never', 'neither', 'hasnt', 'couldnt', 'no', 'against']
add_stop = []

# Working stop-word list: the sklearn defaults minus `remove_stop`, followed by `add_stop`
stopwords = [word for word in _stop_words.ENGLISH_STOP_WORDS if word not in remove_stop] + add_stop

# Reference vocabulary: the NLTK English word list
# (assumes the NLTK 'words' corpus is already available locally)
english_words = set(nltk.corpus.words.words())

# Domain lexicon: the distinct values of the `keyword` column in train.csv.
# Missing keywords, if any, are dropped so that NaN (a float) never ends up
# in the lexicon or in the word set below.
lexicon = pd.read_csv('train.csv', usecols=['keyword'])['keyword'].dropna().unique().tolist()

# Add the lexicon terms to the reference vocabulary
english_words.update(lexicon)

## Importing the data

In [19]:
# Read the two pickled frames, in order, into `df` and `df_test`
df, df_test = (
    pd.read_pickle(path)
    for path in ('clean_data_1608221436.pkl', 'clean_data_1608221848.pkl')
)

In [20]:
# Print a summary of `df`: index range, columns, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            7613 non-null   int64 
 1   text          7613 non-null   object
 2   target        7613 non-null   int64 
 3   cleaned_text  7613 non-null   object
 4   lemmas        7613 non-null   object
 5   lemmas_small  7613 non-null   object
dtypes: int64(2), object(4)
memory usage: 357.0+ KB


## Pre-processing

In [49]:
def _add_text_features(frame):
    """Add the `cleaned_text`, `lemmas` and `lemmas_small` columns to `frame`.

    Each column is derived from the previous one:
    `text` -> `cleaned_text` -> `lemmas` -> `lemmas_small`.
    The frame is modified in place and also returned.
    """
    # NOTE(review): the original called `clean`, which is not defined or imported in the
    # visible notebook (NameError on a fresh kernel). `clean_text` is the cleaner imported
    # from nlpfunctions; confirm it is the intended function and takes the text alone.
    frame['cleaned_text'] = frame['text'].progress_apply(clean_text)
    frame['lemmas'] = frame['cleaned_text'].progress_apply(spacy_lemmatizer, nlp=nlp, stopwords=stopwords)
    frame['lemmas_small'] = frame['lemmas'].progress_apply(remove_nonenglish, corpus=english_words)
    return frame


# Run the same pipeline on both frames
df = _add_text_features(df)
df_test = _add_text_features(df_test)

# Save both frames to pickle under fixed file names
train_path, test_path = 'clean_train.pkl', 'clean_test.pkl'
df.to_pickle(train_path)
df_test.to_pickle(test_path)

# Report the files that were actually written, with a ddmmyyHHMM stamp taken from a
# single clock reading (two separate reads could disagree around midnight)
saved_at = dt.datetime.now().strftime('%d%m%y%H%M')
print(f'Data saved as : {train_path}, {test_path} (at {saved_at})')


100%|[32m██████████[0m| 7613/7613 [00:00<00:00, 8505.49it/s]
100%|[32m██████████[0m| 7613/7613 [00:47<00:00, 161.79it/s]
100%|[32m██████████[0m| 7613/7613 [00:00<00:00, 59310.73it/s]
100%|[32m██████████[0m| 3263/3263 [00:00<00:00, 6672.64it/s]
100%|[32m██████████[0m| 3263/3263 [00:18<00:00, 172.15it/s]
100%|[32m██████████[0m| 3263/3263 [00:00<00:00, 192033.20it/s]


Data saved as :  clean_data_1808221051.pkl


In [50]:
# Raw text of the row at position 121
df['text'].iloc[121]

'Aftershock: Protect Yourself and Profit in the Next Global Financial Meltdown by David Wiedemer http http://t.co/WZTz4hgMVq'

In [51]:
# Cleaned text of the row at position 121, for comparison with its raw `text`
df['cleaned_text'].iloc[121]

'aftershock protect yourself and profit the next global financial meltdown david wiedemer'