In [37]:
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
from nltk.util import ngrams
from nltk.probability import FreqDist
import re
import inflect

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# import training set and test set
# encoding latin1 o.w. unicode decode error
train_set = pd.read_csv('Corona_NLP_train.csv', encoding='latin1')
test_set = pd.read_csv('Corona_NLP_test.csv', encoding='latin1')

# drop duplicates from dataframe
train_set.drop_duplicates(keep=False,inplace=True) 
test_set.drop_duplicates(keep=False,inplace=True) 

train_set.info()

# % of missing values per column
train_set.isnull().sum()/len(train_set)*100
test_set.isnull().sum()/len(train_set)*100

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8055 entries, 0 to 8054
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       8055 non-null   int64 
 1   ScreenName     8055 non-null   int64 
 2   Location       6469 non-null   object
 3   TweetAt        8055 non-null   object
 4   OriginalTweet  8055 non-null   object
 5   Sentiment      8054 non-null   object
dtypes: int64(2), object(4)
memory usage: 440.5+ KB


UserName          0.000000
ScreenName        0.000000
Location         10.353818
TweetAt           0.000000
OriginalTweet     0.000000
Sentiment         0.000000
dtype: float64

In [7]:
train_set['Sentiment'].value_counts(normalize=True) * 100

Positive              26.794140
Negative              25.130370
Neutral               17.717904
Extremely Negative    15.309163
Extremely Positive    15.048423
Name: Sentiment, dtype: float64

In [44]:
# preprocessing 'OriginalTweet' column
stops = set(stopwords.words("english"))

def replace_numbers(words):
    p = inflect.engine()
    text = []
    for word in words:
        if word.isdigit():
            new = p.number_to_words(word)
            text.append(new)
        else:
            text.append(word)
    return text

def preprocess(x):
  # remove urls  
  x = re.sub(r'^https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE)
  # remove https
  x = re.sub(r'https?://\S+', '', x)
  # remove noise
  x = re.sub('[^a-z\s]', '', x.lower()) 
  # remove punctuations
  x = re.sub(r'[^\w\s]', '', x)
  # remove hashtags and mentions
  x = x.replace("#", "").replace("_", " ").replace("@", " ")
  # remove stopwords              
  x = [w for w in x.split() if w not in stops] 
  # replace numbers with textual representations
  x = replace_numbers(x)
  return ' '.join(x)                                   

In [45]:
train_set['CleanTweet'] = train_set['OriginalTweet'].apply(preprocess) 
train_set['CleanTweet']

0                             menyrbie philgahan chrisitv
1       advice talk neighbours family exchange phone n...
2       coronavirus australia woolworths give elderly ...
3       food stock one empty please dont panic enough ...
4       ready go supermarket covid outbreak im paranoi...
                              ...                        
8050    hit grocery store early attempt find one elusi...
8051    home affairs minister peter dutton says joint ...
8052    well never get weightloss side effect anything...
8053    nigel balmaingourmet agree wholeheartedly coul...
8054    ebay going anything absolutely disgusting pric...
Name: CleanTweet, Length: 8055, dtype: object

In [33]:
# most common words
from collections import Counter
Counter(" ".join(train_set["CleanTweet"]).split()).most_common(20)

[('covid', 4226),
 ('coronavirus', 3993),
 ('store', 1933),
 ('food', 1748),
 ('grocery', 1724),
 ('people', 1319),
 ('supermarket', 1302),
 ('prices', 1127),
 ('amp', 933),
 ('panic', 910),
 ('consumer', 866),
 ('shopping', 836),
 ('online', 782),
 ('get', 671),
 ('need', 655),
 ('us', 549),
 ('buying', 544),
 ('like', 529),
 ('stock', 490),
 ('pandemic', 470)]

In [43]:
# Ngrams (bigram and trigram)
train_set['bigrams'] = train_set['CleanTweet'].apply(lambda row: list(nltk.bigrams(row.split(' '))))
train_set['trigrams'] = train_set['CleanTweet'].apply(lambda row: list(nltk.trigrams(row.split(' '))))
train_set['bigrams']

0          [(menyrbie, philgahan), (philgahan, chrisitv)]
1       [(advice, talk), (talk, neighbours), (neighbou...
2       [(coronavirus, australia), (australia, woolwor...
3       [(food, stock), (stock, one), (one, empty), (e...
4       [(ready, go), (go, supermarket), (supermarket,...
                              ...                        
8050    [(hit, grocery), (grocery, store), (store, ear...
8051    [(home, affairs), (affairs, minister), (minist...
8052    [(well, never), (never, get), (get, weightloss...
8053    [(nigel, balmaingourmet), (balmaingourmet, agr...
8054    [(ebay, going), (going, anything), (anything, ...
Name: bigrams, Length: 8055, dtype: object

In [42]:
#from sklearn.feature_extraction.text import CountVectorizer
#word_vectorizer = CountVectorizer(ngram_range=(1,2), analyzer='word')
#sparse_matrix = word_vectorizer.fit_transform(train_set['CleanTweet'])
#frequencies = sum(sparse_matrix).toarray()[0]
#pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])