In [21]:
import numpy as np
import pandas as pd

In [22]:
train_data = pd.read_csv('./data/twitter_training.csv')
test_data = pd.read_csv('./data/twitter_validation.csv')

train_data.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [25]:
train_data.rename(columns={
    train_data.columns[0] : 'id', 
    train_data.columns[1] : 'entity', 
    train_data.columns[2] : 'sentiment', 
    train_data.columns[3] : 'content'
    }, inplace=True)

In [26]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         74681 non-null  int64 
 1   entity     74681 non-null  object
 2   sentiment  74681 non-null  object
 3   content    73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [27]:
train_data.isnull().sum()

id             0
entity         0
sentiment      0
content      686
dtype: int64

In [28]:
df = train_data.dropna()

In [29]:
df

Unnamed: 0,id,entity,sentiment,content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [30]:
df.shape

(73995, 4)

In [31]:
df.isna().sum()

id           0
entity       0
sentiment    0
content      0
dtype: int64

## Manipulate content to lower case

In [32]:
df.apply(lambda x: x.astype(str).str.lower())

Unnamed: 0,id,entity,sentiment,content
0,2401,borderlands,positive,i am coming to the borders and i will kill you...
1,2401,borderlands,positive,im getting on borderlands and i will kill you ...
2,2401,borderlands,positive,im coming on borderlands and i will murder you...
3,2401,borderlands,positive,im getting on borderlands 2 and i will murder ...
4,2401,borderlands,positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,nvidia,positive,just realized that the windows partition of my...
74677,9200,nvidia,positive,just realized that my mac window partition is ...
74678,9200,nvidia,positive,just realized the windows partition of my mac ...
74679,9200,nvidia,positive,just realized between the windows partition of...


## Remove Punctuation

In [33]:
df['text'] = df['content'].str.replace('[^\w\s]','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['content'].str.replace('[^\w\s]','')


In [34]:
df.head()

Unnamed: 0,id,entity,sentiment,content,text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting into borderlands and i can murder y...


## Tokenized Text

In [36]:
import nltk
nltk.download('punkt')
df['tokenized_text'] = df.apply(lambda x: nltk.word_tokenize(x['text']), axis=1)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokenized_text'] = df.apply(lambda x: nltk.word_tokenize(x['text']), axis=1)


In [39]:
df

Unnamed: 0,id,entity,sentiment,content,text,tokenized_text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,I am coming to the borders and I will kill you...,"[I, am, coming, to, the, borders, and, I, will..."
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you ...,"[im, getting, on, borderlands, and, i, will, k..."
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...,"[im, coming, on, borderlands, and, i, will, mu..."
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting on borderlands 2 and i will murder ...,"[im, getting, on, borderlands, 2, and, i, will..."
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting into borderlands and i can murder y...,"[im, getting, into, borderlands, and, i, can, ..."
...,...,...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...,Just realized that the Windows partition of my...,"[Just, realized, that, the, Windows, partition..."
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,Just realized that my Mac window partition is ...,"[Just, realized, that, my, Mac, window, partit..."
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,Just realized the windows partition of my Mac ...,"[Just, realized, the, windows, partition, of, ..."
74679,9200,Nvidia,Positive,Just realized between the windows partition of...,Just realized between the windows partition of...,"[Just, realized, between, the, windows, partit..."


In [41]:
type(df[:-1].values)

numpy.ndarray

## Remove Stopwords

In [46]:
from nltk.corpus import stopwords
nltk.download('stopwords')

sw = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [49]:
## Declare converter

def convert_tokenize_to_stopwords(tokenized_text: np.ndarray):
    return ' '.join([word for word in tokenized_text if word not in(sw)])

In [50]:
df['stopwords_text'] = df['tokenized_text'].apply(convert_tokenize_to_stopwords)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stopwords_text'] = df['tokenized_text'].apply(convert_tokenize_to_stopwords)


Unnamed: 0,id,entity,sentiment,content,text,tokenized_text,stopwords_text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,I am coming to the borders and I will kill you...,"[I, am, coming, to, the, borders, and, I, will...","I coming borders I kill ,"
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you ...,"[im, getting, on, borderlands, and, i, will, k...","im getting borderlands kill ,"
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...,"[im, coming, on, borderlands, and, i, will, mu...","im coming borderlands murder ,"
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting on borderlands 2 and i will murder ...,"[im, getting, on, borderlands, 2, and, i, will...","im getting borderlands 2 murder ,"
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting into borderlands and i can murder y...,"[im, getting, into, borderlands, and, i, can, ...","im getting borderlands murder ,"
...,...,...,...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...,Just realized that the Windows partition of my...,"[Just, realized, that, the, Windows, partition...",Just realized Windows partition Mac like 6 yea...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,Just realized that my Mac window partition is ...,"[Just, realized, that, my, Mac, window, partit...",Just realized Mac window partition 6 years beh...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,Just realized the windows partition of my Mac ...,"[Just, realized, the, windows, partition, of, ...",Just realized windows partition Mac 6 years be...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...,Just realized between the windows partition of...,"[Just, realized, between, the, windows, partit...",Just realized windows partition Mac like 6 yea...
