## 2. Twitter Data Cleaning
---

### 2.1: Import required libraries for processing

___

In [1]:
import pandas as pd
import re
import glob
import numpy as np
import datetime
import random

In [2]:
#timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

### 2.2: Use saved file as Input file for processing 

____

In [3]:
# Define input and output files (both paths are relative).
# input_file: CSV of raw tweets that the loading step reads.
input_file = '../data/raw/tweets.csv'
# Disabled: `input_dir` is only needed by the commented-out multi-file loader.
#input_dir = r'../data/raw/tweets.csv' # use your path
# Disabled: timestamped output name; it needs `timestamp`, which is also commented out.
#output_file = '../data/cleaned/WitsTweets-'+timestamp+'.csv'
# output_file: CSV that the processed DataFrame is written to at the end.
output_file = '../data/cleaned/WitsTweets.csv'

In [4]:
# Column labels for the raw tweet CSV (the load cell assigns the same list again).
col_names = ['Date', 'User', 'Tweet', 'Url', 'Location', 'Hashtags']

# Disabled alternative loader, kept for reference: it reads every *.csv found
# under `input_dir`, concatenates the frames and drops rows whose "Url" repeats.
# NOTE(review): `input_dir` is itself commented out in the paths cell, so that
# assignment has to be re-enabled before this block is uncommented.
# all_files = glob.glob(input_dir + "/*.csv")

# li = []

# for filename in all_files:
#     df = pd.read_csv(filename, names = col_names)
#     print(df.shape)
#     li.append(df)

# data = pd.concat(li, axis=0, ignore_index=True)
# print(data.shape)
# data.drop_duplicates(subset = "Url", inplace =True)
# print(data.shape)
# data.head()


### 2.2.1: Basic Text Cleaning

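For basic text cleaning we convert all the text to lowercase, remove user mentions, URLs and any remaining non-alphanumeric characters, collapse unnecessary whitespace, and drop English stop words and retweet markers from the tweets.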
____

In [5]:
 # Read in Raw Data and Assign Column Names where required
# Column labels are supplied explicitly through `names`; `header=None` spells
# out what pandas already does when `names` is given (no row is used as header).
col_names = ['Date', 'User', 'Tweet', 'Url', "Location", 'Hashtags']
data = pd.read_csv(input_file, header=None, names=col_names)
data.head(n=5)

Unnamed: 0,Date,User,Tweet,Url,Location,Hashtags
0,2020-03-05 12:38:42,Deni707,The protests sparked last week by University o...,https://twitter.com/Deni707/status/12355452719...,,
1,2020-03-05 12:38:18,pearl_ndlalane,@Katlego_MS @Yessirmaa -Wits,https://twitter.com/pearl_ndlalane/status/1235...,"Rustenburg, South Africa",
2,2020-03-05 12:37:37,mrlechesa,RT @NotInMyWatch: Students at Wits are going t...,https://twitter.com/mrlechesa/status/123554499...,Johannesburg,
3,2020-03-05 12:37:34,PhilarN_91,RT @Didi_Azania: ATT: Wits students! This is h...,https://twitter.com/PhilarN_91/status/12355449...,"Midrand, South Africa",
4,2020-03-05 12:37:33,Maliwa_Luyanda,RT @Katlego_MS: Brand Ambassadors needed from...,https://twitter.com/Maliwa_Luyanda/status/1235...,PE | CPT,


In [6]:
from nltk.corpus import stopwords
# Retweet markers filtered out in addition to NLTK's English stop words.
additional  = ['rt','rts','retweet', 'RT']
# `swords` is a set so that the membership test in the cleaning step is cheap.
swords = set(stopwords.words('english')) | set(additional)

In [7]:
#swords

In [8]:
# Build a tokenised, stop-word-free version of every tweet in `cleaned_text`.
#
# `regex=True` is passed explicitly: since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, so without it the patterns
# below would be searched for verbatim and the text would be left uncleaned.
# The patterns are raw strings so that `\w` / `\S` reach the regex engine
# instead of being parsed as (invalid) Python string escapes.
data['cleaned_text'] = (
    data['Tweet']
    .str.lower()
    .str.replace(r'(@[a-z0-9]+)\w+', ' ', regex=True)   # user mentions
    .str.replace(r'(http\S+)', ' ', regex=True)         # URLs
    .str.replace(r'([^0-9a-z \t])', ' ', regex=True)    # anything but a-z, 0-9, space, tab
    .str.replace(r' +', ' ', regex=True)                # collapse runs of spaces
    # Split on whitespace and keep only the tokens that are not stop words.
    .apply(lambda text: [tok for tok in text.split() if tok not in swords])
)

In [9]:
def convert_list_string(mylist):
    """Return the strings in ``mylist`` joined into one string, separated by single spaces."""
    separator = " "
    return separator.join(mylist)

In [10]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()


def _stem_tokens(tokens):
    """Return the Porter stem of every non-empty token in ``tokens``."""
    return [ps.stem(tok) for tok in tokens if tok != '']


# `cleaned_text` must still hold token lists here (i.e. before they are joined
# back into strings); iterating a plain string would stem single characters.
data['stemmed'] = data['cleaned_text'].apply(_stem_tokens)

In [11]:
# Preview the stemmed token lists of the first two tweets.
data['stemmed'].head(2)

0    [protest, spark, last, week, univers, kwazulu,...
1                                                [wit]
Name: stemmed, dtype: object

In [12]:
# Turn each token list back into one space-separated string.
# Not idempotent: running this cell a second time would join the individual
# characters of the already-joined strings.
data['cleaned_text'] = data['cleaned_text'].map(convert_list_string)
data['cleaned_text'].head()

0    protests sparked last week university kwazulu ...
1                                                 wits
2    students wits going lot really heart breaking ...
3       att wits students got study overseas awareness
4    brand ambassadors needed following campuses uf...
Name: cleaned_text, dtype: object

In [13]:
# Show the first rows, now including the `cleaned_text` and `stemmed` columns.
data.head(n=5)

Unnamed: 0,Date,User,Tweet,Url,Location,Hashtags,cleaned_text,stemmed
0,2020-03-05 12:38:42,Deni707,The protests sparked last week by University o...,https://twitter.com/Deni707/status/12355452719...,,,protests sparked last week university kwazulu ...,"[protest, spark, last, week, univers, kwazulu,..."
1,2020-03-05 12:38:18,pearl_ndlalane,@Katlego_MS @Yessirmaa -Wits,https://twitter.com/pearl_ndlalane/status/1235...,"Rustenburg, South Africa",,wits,[wit]
2,2020-03-05 12:37:37,mrlechesa,RT @NotInMyWatch: Students at Wits are going t...,https://twitter.com/mrlechesa/status/123554499...,Johannesburg,,students wits going lot really heart breaking ...,"[student, wit, go, lot, realli, heart, break, ..."
3,2020-03-05 12:37:34,PhilarN_91,RT @Didi_Azania: ATT: Wits students! This is h...,https://twitter.com/PhilarN_91/status/12355449...,"Midrand, South Africa",,att wits students got study overseas awareness,"[att, wit, student, got, studi, oversea, awar]"
4,2020-03-05 12:37:33,Maliwa_Luyanda,RT @Katlego_MS: Brand Ambassadors needed from...,https://twitter.com/Maliwa_Luyanda/status/1235...,PE | CPT,,brand ambassadors needed following campuses uf...,"[brand, ambassador, need, follow, campus, uf, ..."


In [14]:
# data['stemmed'] = data['stemmed'].apply(convert_list_string)
# data.stemmed.head()

### 2.2.2: Remove Emojis
___

In [15]:
def remove_emojis(input_text):
    """Return ``input_text`` with every non-ASCII character dropped.

    Emojis are removed because they are not ASCII; note that any other
    non-ASCII character (for example accented letters) is dropped as well.
    """
    ascii_bytes = input_text.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

In [16]:
# Strip non-ASCII characters (emojis included) from the original tweet text.
data['Tweet'] = data['Tweet'].apply(remove_emojis)

### 2.2.3: Remove Unnecessary Whitespaces

___

In [17]:
def clean_whitespace(input_text):
    """Return ``input_text`` without leading and trailing whitespace."""
    return input_text.strip()

In [18]:
# Trim leading/trailing whitespace from the original tweet text.
data['Tweet'] = data['Tweet'].apply(clean_whitespace)

### 2.2.4: Replace Newline Character
___

In [19]:
def replace_newline(input_text):
    """Return ``input_text`` with every newline character replaced by a full stop."""
    lines = input_text.split("\n")
    return ".".join(lines)

In [20]:
# Replace newlines inside the original tweet text with full stops.
data['Tweet'] = data['Tweet'].apply(replace_newline)

### Save the Processed Corpus to Disk

In [21]:
# Write the processed frame to `output_file`; `index=False` leaves the row
# index out of the CSV.
data.to_csv(path_or_buf=output_file, index=False)