In [1]:
import os
import pandas as pd
import numpy as np

## CONVERTING CSV FILES INTO TXT FOR INPUT TO TWARC

In [17]:
data=pd.read_csv('IEEE dataset/ORIGINAL DATA/april28-may29.csv',header=None)

In [18]:
data.shape

(46059, 2)

In [19]:
data=data.values

In [20]:
ids=[int(i) for i in data[:,0]]
ids=np.asarray(ids)
sent_score=np.asarray(data[:,1])
df=list(zip(ids,sent_score))

In [21]:
df[:10]

[(1254995208888094720, 0.0),
 (1254995485452050432, 0.0),
 (1254995705527177216, 0.525),
 (1254995868333289472, 0.28750000000000003),
 (1254996123514789888, 0.0),
 (1254996305082036224, 0.313973063973064),
 (1254996436162277376, -0.05),
 (1254996786105790464, 0.0),
 (1254997859285508096, 0.0),
 (1254998021722390528, 0.0)]

In [22]:
d=dict([key,value] for (key,value) in df)

In [23]:
len(d)

46037

In [9]:
out_file=open('IEEE dataset/TWEET IDS/april28-may29.txt','w',encoding='utf8')
for i in ids:
    print((i),file=out_file)
out_file.close()

## MY PREPROCESSOR

* LowerCasing- The tweet is first converted into lower case
* Removal of Retweet|RT|via
* Removal of mentions(like @xyz)
* Removal of links in the tweet
* Negation handling
* Tokenisation to get standard english words only
* Stop word removal - the stopwords were modified
* Lemmatization using WordNetLemmatizer

In [2]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [3]:
tok=RegexpTokenizer(r'[a-zA-Z]+')     #taking only alphabets
sw=stopwords.words("english")
lm=WordNetLemmatizer()

In [4]:
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [5]:
sw.remove('not')

In [6]:
sw.remove('no')

In [7]:
len(sw)

177

In [8]:
negations=["isn't","aren't","wasn't","weren't","won't", "wouldn't","ain't","doesn't","don't","didn't", "can't","couldn't", "doesn't", "hadn't", "hasn't", "haven't", "mightn't", "mustn't", "needn't", "shan't", "shouldn't"]

In [9]:
def get_clean_data(text):
    text=text.lower()
    #removing retweets
    text=re.sub(r'(rt|RT|retweet|from|via)',"",text)
    
    #removing mentions 
    mentions=re.findall("@[A-Za-z0-9_]+",text)
    for m in mentions:
        text=text.replace(m,"")
        
    #links removal
    urls=re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    for u in urls:
        text=text.replace(u,"")
        
    #handling negations- REPLACING ALL THE NEGATIVE HELPING VERBS WITH NOT
    for w in negations:
        if text.find(w)!=-1:
            text=text.replace(w,'not')
            
    #tokenisation
    tokens=tok.tokenize(text)      #takes only standard english alphabets
    #stopword removal
    new_tokens=[t for t in tokens if t not in sw and len(t)>2]
    #stemming
    clean_text=[lm.lemmatize(t) for t in new_tokens]
    
    clean_text=" ".join(clean_text)
    return clean_text

In [10]:
get_clean_data("""RT@abc, @ghs: Don't worry, this pandemic will end soon
Wash your hands at regular itervals, Do not a step outside without wearing a mask :)
On seeing any of the following symptoms: dry cough, high fever, breathing difficulty, loss of taste, loss of smell, visit nearby hospital
For more details visit:'http://tnp.dtu.ac.in/rm_2016-17/intern/intern_login,
 'https://icmr.gov.in',
 'http://www.google.com/egfbhsd/g
#IndiafightsCorona""")

'not worry pandemic end soon wash hand regular itervals not step outside without wearing mask seeing following symptom dry cough high fever breathing difficulty loss taste loss smell visit nearby hospital detail visit indiafightscorona'

In [11]:
get_clean_data("""RT @ashokgehlot51: This message is universal . Stay home, stay safe - let’s Defeat Corona together #राजस्थान_सतर्क_है
RT @MakedaMorrison: Day 5: We have rediscovered farming
RT @_rohanverma: I am the CEO of @MapmyIndia - through https://t.co/p8Iqtoz77t all can find nearby corona testing, treatment &amp; isolation ce…
RT @jabinbotsford: Close up of President @realDonaldTrump notes is seen where he crossed out "Corona" and replaced it with "Chinese" Virus…
This corona virus got me texting people back now lol.
RT @Biancaixvi: Corona day 3: it just feels like Sunday again and...again
house. he will intimidate the corona https://t.co/KXrYmAS25H
RT @chandlerriggs: here’s a deleted scene from TWD’s special on corona virus https://t.co/qGuSfyjpK5
RT @suliceu99: our captain never change :") https://t.co/47gGJdiIvs""")

'message universal stay home stay safe let defeat corona together day rediscovered farming ceo find nearby corona testing treatment amp isolation close president note seen crossed corona replaced chinese virus corona virus got texting people back lol corona day feel like sunday house intimidate corona deleted scene twd special corona virus captain never change'

## HYDRATING IDS USING TWARC

In [24]:
from twarc import Twarc

In [25]:
from twitter_credentials import *

In [26]:
t=Twarc(CONSUMER_KEY,CONSUMER_SECRET,ACCESS_TOKEN,ACESS_TOKEN_SECRET)

In [27]:
type(t)

twarc.client.Twarc

In [35]:
processed_tweet=open('IEEE dataset/PROCESSED DATA/new_april28-may29.txt','w',encoding='utf8')
for tweet in t.hydrate(open('IEEE dataset/TWEET IDS/april28-may29.txt')):
    txt=str(tweet['full_text'])
    t_id=int(tweet['id'])
    sent=d[t_id]
    clean_txt=get_clean_data(txt)
    #if clean_txt!=None:
    print((sent),end=" ",file=processed_tweet)
    print((clean_txt),file=processed_tweet)

In [32]:
processed_tweet.close()

In [33]:
processed_tweet=open('IEEE dataset/PROCESSED DATA/new_april28-may29.txt','r',encoding='utf8')
lines=processed_tweet.readlines()

In [34]:
len(lines)

22493