<a href="https://colab.research.google.com/github/kiran-collab/COVID_Tweet_NER/blob/main/Twitter_COVID_Tweets_Kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from itertools import groupby

from nltk.corpus import stopwords

# URL pattern: an http/ftp/https scheme, a host made of at least two dot-separated
# labels, and an optional trailing path/query that may not end in '.', ',' or ':'.
# Contains only non-capturing groups, so findall() yields whole-match strings.
re_url = r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'

In [None]:
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads later on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def clean_text(text):
    '''Normalise a raw tweet into lower-case text without noise tokens.

    Steps, in order: lowercase; drop a leading retweet marker ("rt"); drop text in
    square brackets; drop URLs (module-level ``re_url``); drop ``<...>`` tags; drop
    ``@user`` mentions; drop ASCII punctuation; turn newlines into spaces; drop every
    token that contains a digit.

    Whitespace is neither collapsed nor trimmed, so removed tokens can leave double,
    leading or trailing spaces behind.

    Args:
        text (str): raw tweet text.

    Returns:
        str: the cleaned, lower-cased text.
    '''
    text = text.lower()
    # \b limits the match to the standalone marker, so words starting with "rt" survive
    text = re.sub(r'^rt\b', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(re_url, '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Newlines become spaces; deleting them would glue the neighbouring words together
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)

    return text

def get_consecutive_chars(text):
    '''Count characters that belong to runs of a repeated character.

    The text is split into runs of identical adjacent characters. Every run longer
    than one character adds its full length to exactly one bucket:

    * ASCII punctuation (``string.punctuation``) -> ``qtd_consecutive_punctuation``
    * upper-case letters (``str.isupper``)        -> ``qtd_consecutive_chars_upper``
    * anything else (lower-case letters, digits, whitespace, ...)
                                                  -> ``qtd_consecutive_chars``

    Args:
        text (str): text to analyse; an empty string yields all zeros.

    Returns:
        dict: the three counters keyed by the names above.
    '''
    counts = {
        'qtd_consecutive_chars': 0,
        'qtd_consecutive_chars_upper': 0,
        'qtd_consecutive_punctuation': 0,
    }

    for char, run in groupby(text):
        run_length = sum(1 for _ in run)
        if run_length < 2:
            continue

        if char in string.punctuation:
            bucket = 'qtd_consecutive_punctuation'
        # isupper() is False for digits/whitespace, unlike `char.upper() == char`
        elif char.isupper():
            bucket = 'qtd_consecutive_chars_upper'
        else:
            bucket = 'qtd_consecutive_chars'
        counts[bucket] += run_length

    return counts

In [None]:
# Sanity check of clean_text on a sample containing capitals, digits, punctuation and a link
clean_text('Test 123 of the function clean_text!! https://fake_url/2020')

'test  of the function cleantext '

In [None]:
# Sanity check of get_consecutive_chars: the sample has the repeated runs '!!', 'oo' and '...'
get_consecutive_chars('test of the function get_consecutive_chars!! lool...')

{'qtd_consecutive_chars': 2,
 'qtd_consecutive_chars_upper': 0,
 'qtd_consecutive_punctuation': 5}

In [None]:
# Read the tweet dataset.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm Drive is mounted first.
df_full = pd.read_csv('/content/drive/MyDrive/Tweeter_Data_IN.csv')

# Store the tweet ids. Only one frame is loaded in this notebook (there is no separate
# train/test file), so the ids are taken from df_full.
idx_train = df_full['id'].values

In [None]:
stop_words = stopwords.words('english')
# Set copy for O(1) membership tests; `stop_words` itself stays a list for other cells
stop_word_set = set(stop_words)

# Clean each tweet, then drop stop words and re-join the remaining tokens with single spaces
df_full['text_cleaned'] = (
    df_full['text']
    .apply(clean_text)
    .str.split()
    .apply(lambda tokens: ' '.join(token for token in tokens if token not in stop_word_set))
)

In [None]:
# Token counts on the cleaned text (stop words already removed)
tokens_cleaned = df_full['text_cleaned'].str.split()
df_full['qnt_words'] = tokens_cleaned.apply(len)
df_full['qnt_unique_words'] = tokens_cleaned.apply(lambda x: len(set(x)))

# Counts on the raw tweet text
df_full['qnt_chars'] = df_full['text'].str.len()
df_full['qnt_hashtags'] = df_full['text'].str.findall(r'#(\w+)').apply(len)
df_full['qnt_user_mention'] = df_full['text'].str.findall(r'@(\w+)').apply(len)
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave the text untouched and make this equal to qnt_chars.
df_full['qnt_punctuation'] = df_full['text'].str.replace(r'[\w\s#]+', '', regex=True).str.len()
df_full['qnt_urls'] = df_full['text'].str.findall(re_url).apply(len)

tokens_raw = df_full['text'].str.split()
# Mean token length; whitespace-only text stays NaN but without numpy's empty-slice warning
df_full['mean_chars_words'] = tokens_raw.apply(
    lambda x: np.mean([len(w) for w in x]) if x else np.nan
)

# Set lookup instead of scanning the stop-word list once per token
stop_word_lookup = set(stop_words)
df_full['qnt_stop_words'] = tokens_raw.apply(
    lambda x: sum(1 for w in x if w.lower() in stop_word_lookup)
)

In [None]:
def _has_any(matches):
    '''Return 1 when `matches` (a list or string) is non-empty, else 0.'''
    return int(len(matches) > 0)

# Binary (0/1) presence flags computed on the raw tweet text
df_full['contains_hashtags'] = df_full['text'].str.findall(r'#(\w+)').apply(_has_any)
df_full['contains_user_mention'] = df_full['text'].str.findall(r'@(\w+)').apply(_has_any)
# regex=True must be explicit: pandas >= 2.0 defaults to a literal (non-regex) replace
df_full['contains_punctuation'] = df_full['text'].str.replace(r'[\w\s#]+', '', regex=True).apply(_has_any)
# A 0/1 flag like its siblings; the number of URLs is already stored in qnt_urls
df_full['contains_urls'] = df_full['text'].str.findall(re_url).apply(_has_any)

# 1 when the raw text starts with 'RT', else 0
df_full['is_reply'] = df_full['text'].str.startswith('RT') + 0

In [None]:
# Run get_consecutive_chars on every raw tweet; each returned dict becomes one row,
# aligned on df_full's index.
df_consecutive = pd.DataFrame(
    [get_consecutive_chars(tweet) for tweet in df_full['text']],
    index=df_full.index,
)

# Copy each counter column onto the main frame
for name, values in df_consecutive.items():
    df_full[name] = values

In [None]:
# List the columns now present on df_full (original fields plus the engineered features)
df_full.columns

Index(['created_at', 'hashtags', 'favorite_count', 'id', 'lang', 'place',
       'retweet_count', 'text', 'tweet_url', 'user_screen_name',
       'user_description', 'user_favourites_count', 'user_followers_count',
       'user_friends_count', 'user_listed_count', 'user_location', 'user_name',
       'user_screen_name.1', 'user_statuses_count', 'text_cleaned',
       'qnt_words', 'qnt_unique_words', 'qnt_chars', 'qnt_hashtags',
       'qnt_user_mention', 'qnt_punctuation', 'qnt_urls', 'mean_chars_words',
       'qnt_stop_words', 'contains_hashtags', 'contains_user_mention',
       'contains_punctuation', 'contains_urls', 'is_reply',
       'qtd_consecutive_chars', 'qtd_consecutive_chars_upper',
       'qtd_consecutive_punctuation'],
      dtype='object')

In [None]:
# Preview the first rows to eyeball the engineered feature values
df_full.head()

Unnamed: 0,created_at,hashtags,favorite_count,id,lang,place,retweet_count,text,tweet_url,user_screen_name,user_description,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name.1,user_statuses_count,text_cleaned,qnt_words,qnt_unique_words,qnt_chars,qnt_hashtags,qnt_user_mention,qnt_punctuation,qnt_urls,mean_chars_words,qnt_stop_words,contains_hashtags,contains_user_mention,contains_punctuation,contains_urls,is_reply,qtd_consecutive_chars,qtd_consecutive_chars_upper,qtd_consecutive_punctuation
0,Wed Mar 25 06:20:02 +0000 2020,,0,1242697733175220000,en,"Pune, India",0,Meditation and mindfulness tips for coping wit...,https://twitter.com/ShalomFloat/status/1242697...,ShalomFloat,Shalom float spa is the first dedicated float ...,252,382,97,24,India,Shalom Float Spa,ShalomFloat,1770,meditation mindfulness tips coping corona viru...,18,16,239,0,0,11,1,6.741935,12,0,0,1,1,0,10,0,2
1,Wed Mar 25 06:36:25 +0000 2020,,0,1242701857253980000,en,"Gandhinagar, India",0,Contribute to INDIA FIGHTS CORONA @ Gandhinaga...,https://twitter.com/prakashkalal/status/124270...,prakashkalal,,693,175,107,0,"Gandhinagar, India",prks7,prakashkalal,73,contribute india fights corona gandhinagar guj...,6,6,80,0,0,7,1,8.0,1,0,0,1,1,0,2,0,2
2,Wed Mar 25 06:18:32 +0000 2020,,0,1242697356249880000,en,"Bidhan Nagar, India",0,@DelhiPolice @DCPEastDelhi @msisodia @AamAadmi...,https://twitter.com/Bhartendulkar/status/12426...,Bhartendulkar,Senior Media Analyst @Indianpac | Ex Sr Report...,5273,609,2239,7,"Kolkata, India",Bhartendu Sharma,Bhartendulkar,6002,sir please send team immediately sanitize buil...,7,7,153,0,5,12,1,8.625,3,0,1,1,1,0,4,0,2
3,Wed Mar 25 06:05:46 +0000 2020,SSC_UFM_MAT_KARONA Corona UFM,9,1242694142242650000,en,"Maharashtra, India",19,#SSC_UFM_MAT_KARONA\n#Corona is slow poison #U...,https://twitter.com/patilkiii/status/124269414...,patilkiii,"Ab khushi de ke aazma le khuda, In ghamon se t...",1744,259,786,0,"Pune, India",ùï¨ùñáùñçùñéùñëùñÜùñòùñç ùïæùñöùñì...,patilkiii,1387,sscufmmatkaronacorona slow poison ufm instant ...,9,9,160,3,3,8,1,7.888889,4,1,1,1,1,0,4,2,2
4,Wed Mar 25 06:31:10 +0000 2020,Corona pritamkumarmurari Voice,0,1242700536752700000,en,"Bokaro Steel City, India",0,https://t.co/P7eQiq8tFa Mata Di \nDoston ise #...,https://twitter.com/SINGERMurariPa1/status/124...,SINGERMurariPa1,"Singing,Dancing,Acting,Entertainment",4,2,163,0,‡§¨‡•ã‡§ï‡§æ‡§∞‡•ã ‡§∏‡•ç‡§ü‡•Ä‡§≤ ‡§∏‡§ø‡§ü‡•...,SINGER=Murari Pathak,SINGERMurariPa1,13,mata di doston ise corona ke tarah faila bhai ...,19,19,191,3,0,5,1,5.714286,8,1,0,1,1,0,8,0,2


In [None]:
# Write the feature-enriched frame to a CSV in the current working directory.
# NOTE(review): with pandas' default index=True the row index is written as an unnamed
# first column — confirm that downstream readers expect it.
df_full.to_csv('COVID_kaggle_cleaned_dataset.csv')

# NER

In [None]:
import nltk
# Download the NLTK 'punkt' tokenizer models and the averaged-perceptron POS tagger.
# Presumably needed for tokenising / POS-tagging in the NER section — TODO confirm.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
import pandas as pd

# Load the NER dataset (gzip-compressed CSV, ISO-8859-1 encoded) straight from GitHub.
# The frame must be assigned before it is inspected: calling df.info() first raises
# NameError on a fresh kernel.
df = pd.read_csv('https://github.com/dipanjanS/nlp_workshop_dhs18/raw/master/Unit%2008%20-%20Project%206%20-%20Build%20your%20NER%20Tagger/ner_dataset.csv.gz', compression='gzip', encoding='ISO-8859-1')
df.info()