# Table of Contents

### I. Loading and Preprocessing Data
### II. Extracting Text based Features
> ##### 1. Special Characters and Numbers
> ##### 2. Word Count
> ##### 3. Number of Characters
> ##### 4. Average Word Length
> ##### 5. Stop words
> ##### 6. POS tags
> ##### 7. NER

# I. Loading and Preprocessing Data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Import libraries
import pandas as pd
import re

In [3]:
# Import spacy library
import spacy
# Import stopwords from spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load English language model
nlp = spacy.load('en_core_web_sm')

In [4]:
# Load dataset
df = pd.read_csv(r'/content/drive/My Drive/tweets.csv', nrows=1000)

In [5]:
# Explore dataset
df.head()

Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted
0,RT @rssurjewala: Critical question: Was PayTM ...,False,0,,2016-11-23 18:40:30,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",HASHTAGFARZIWAL,331,True,False
1,RT @Hemant_80: Did you vote on #Demonetization...,False,0,,2016-11-23 18:40:29,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",PRAMODKAUSHIK9,66,True,False
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",False,0,,2016-11-23 18:40:03,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",rahulja13034944,12,True,False
3,RT @ANI_news: Gurugram (Haryana): Post office ...,False,0,,2016-11-23 18:39:59,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",deeptiyvd,338,True,False
4,RT @satishacharya: Reddy Wedding! @mail_today ...,False,0,,2016-11-23 18:39:39,False,,8.014954e+17,,"<a href=""http://cpimharyana.com"" rel=""nofollow...",CPIMBadli,120,True,False


In [6]:
# Only keep text column
df.drop(df.columns[1:], axis=1, inplace=True)

In [7]:
# Dataframe
df.head()

Unnamed: 0,text
0,RT @rssurjewala: Critical question: Was PayTM ...
1,RT @Hemant_80: Did you vote on #Demonetization...
2,"RT @roshankar: Former FinSec, RBI Dy Governor,..."
3,RT @ANI_news: Gurugram (Haryana): Post office ...
4,RT @satishacharya: Reddy Wedding! @mail_today ...


In [8]:
# Example Tweet
df.loc[0,'text']

"RT @rssurjewala: Critical question: Was PayTM informed about #Demonetization edict by PM? It's clearly fishy and requires full disclosure &amp;\x85"

In [9]:
# Example Tweet
df.loc[512,'text']

'RT @smita_muk: BREAKING NEWS\r\nPMapps result amnounced!\r\n90% Indians support #demonetization\r\n<ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><U+270C><U+270C><U+270C><U+270C><U+270C><U+270C><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086>\r\n@narendramodi Zindabad!'

In [10]:
# Preprocess tweets
def preprocess(text):

    # Remove unicode characters
    text = re.sub(r"<U\+[A-Z0-9]+>|<ed>", "", text)
    # Remove newline and rawstring characters
    text = re.sub(r"\n|\r", "", text)

    return text

In [11]:
# Apply function
df['text'] = df['text'].apply(preprocess)

In [12]:
# Print dataframe
df.head()

Unnamed: 0,text
0,RT @rssurjewala: Critical question: Was PayTM ...
1,RT @Hemant_80: Did you vote on #Demonetization...
2,"RT @roshankar: Former FinSec, RBI Dy Governor,..."
3,RT @ANI_news: Gurugram (Haryana): Post office ...
4,RT @satishacharya: Reddy Wedding! @mail_today ...


In [13]:
df.shape

(1000, 1)

# II. Extracting Text based Features

## 1. Special characters

### 1.1 Number of mentions used in Tweets

In [14]:
# Function to count number of mentions in Tweet
def mentions(text):
    
    # Find mentions
    mentions = re.findall('@\w+', text)

    # Return count of mentions
    return len(mentions)

In [15]:
# Apply function
df['mentions_count'] = df['text'].apply(mentions)

In [16]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1
1,RT @Hemant_80: Did you vote on #Demonetization...,1
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2
5,@DerekScissors1: Indias #demonetization: #Bla...,2
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1
7,RT @Joydeep_911: Calling all Nationalists to j...,1
8,RT @sumitbhati2002: Many opposition leaders ar...,2
9,National reform now destroyed even the essence...,0


In [17]:
# Describe
df['mentions_count'].describe()

count    1000.000000
mean        0.866000
std         0.988444
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         8.000000
Name: mentions_count, dtype: float64

### 1.2 Number of hashtags used in Tweets

In [18]:
# Function to count number of hashtags in Tweet
def hashtags(text):
    
    # Find hashtags
    hashtags = re.findall('#\w+', text)

    # Return count of hashtags
    return len(hashtags)

In [19]:
# Apply function
df['hashtags_count'] = df['text'].apply(hashtags)

In [20]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1
9,National reform now destroyed even the essence...,0,1


In [21]:
# Describe
df['hashtags_count'].describe()

count    1000.000000
mean        1.688000
std         1.272114
min         0.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        10.000000
Name: hashtags_count, dtype: float64

### 1.3 Number of name titles in Tweet

In [22]:
# Function to count name titles Tweet
def title(text):
    count = re.findall('Mr\.|Mrs\.|Dr\.|Miss\s*', text)
    return len(count)

In [23]:
# Test output
df['text'].apply(title)

0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: text, Length: 1000, dtype: int64

## 2.1 Word Count

In [24]:
# List comprehension to count number of words in Tweet
df['word_count'] = [len(i.split()) for i in df['text']]

In [25]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17
9,National reform now destroyed even the essence...,0,1,18


In [26]:
# Describe
df['word_count'].describe()

count    1000.000000
mean       16.685000
std         4.566468
min         3.000000
25%        14.000000
50%        17.000000
75%        20.000000
max        28.000000
Name: word_count, dtype: float64

## 2.2 Number of Characters

In [27]:
# List comprehension to count number of characters in Tweet
df['character_count'] = [len(i) for i in df['text']]

In [28]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12,121
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22,143
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18,139
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17,139
9,National reform now destroyed even the essence...,0,1,18,140


In [29]:
# Describe
df['character_count'].describe()

count    1000.000000
mean      124.673000
std        21.427861
min        34.000000
25%       117.000000
50%       135.000000
75%       139.000000
max       148.000000
Name: character_count, dtype: float64

## 3. Average Word Length

In [30]:
# Function to calculate average word length of a Tweet
def avg_word_len(text):
    
    # Variable to store word lengths
    word_lens = 0
    
    # Iterate over all the words in Tweet
    for token in text.split():
        word_lens += len(token)
    
    # Number of words in Tweet
    word_count = text.split()

    # Return average length of words in Tweet
    return word_lens/len(word_count)

In [31]:
# Apply function
df['avg_word_len'] = df['text'].apply(avg_word_len)

In [32]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,avg_word_len
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,6.2
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,5.090909
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138,5.571429
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140,7.75
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107,11.0
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12,121,9.166667
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22,143,5.5
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18,139,6.722222
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17,139,7.176471
9,National reform now destroyed even the essence...,0,1,18,140,6.777778


In [33]:
# Describe
df['avg_word_len'].describe()

count    1000.000000
mean        6.865341
std         1.698611
min         3.892857
25%         5.650000
50%         6.578947
75%         7.648810
max        16.666667
Name: avg_word_len, dtype: float64

## 4. Stopwords

In [34]:
# Function to count the number of stopwords in Tweets
def stopwords(text):

    # Create spacy object
    doc = nlp(text)

    # Variable to store count of stopwords
    count = 0
    for token in doc:
        if token.is_stop == True:
            count += 1
    return count

In [35]:
# Apply function
df['stopwords'] = df['text'].apply(stopwords)

In [36]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,avg_word_len,stopwords
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,6.2,7
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,5.090909,4
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138,5.571429,5
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140,7.75,2
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107,11.0,0
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12,121,9.166667,4
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22,143,5.5,11
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18,139,6.722222,8
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17,139,7.176471,8
9,National reform now destroyed even the essence...,0,1,18,140,6.777778,7


In [37]:
# Describe
df['stopwords'].describe()

count    1000.000000
mean        6.038000
std         3.285019
min         0.000000
25%         4.000000
50%         6.000000
75%         8.000000
max        19.000000
Name: stopwords, dtype: float64

## 5. POS tags

In [38]:
# Function to calculate the number of specific POS tags
def pos(text):

    # Create spacy object
    doc = nlp(text)

    # Variable to store the count of POS tags
    count = 0

    # Iterate over the tokens
    for token in doc:
        # Count Noun, Preposition, Adjective
        if token.pos_ in ["NOUN","ADP","ADJ"]:
            count += 1
    
    # Return the count
    return count

In [39]:
# Apply function
df['pos'] = df['text'].apply(pos)

In [40]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,avg_word_len,stopwords,pos
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,6.2,7,12
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,5.090909,4,5
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138,5.571429,5,4
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140,7.75,2,11
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107,11.0,0,6
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12,121,9.166667,4,5
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22,143,5.5,11,12
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18,139,6.722222,8,5
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17,139,7.176471,8,10
9,National reform now destroyed even the essence...,0,1,18,140,6.777778,7,9


In [41]:
# Describe
df['pos'].describe()

count    1000.000000
mean        7.562000
std         2.995184
min         0.000000
25%         5.000000
50%         8.000000
75%        10.000000
max        20.000000
Name: pos, dtype: float64

## 6. NER

In [42]:
# Function to count NER
def ner(text):

    # Create spacy object
    doc = nlp(text)

    # Variable to store the count of POS tags
    count = 0

    # Iterate over the tokens
    for ent in doc.ents:
        # Increment counter if token is a NER
        if ent.label_:
            count += 1
    # Return count
    return count

In [43]:
# Apply function
df['ner'] = df['text'].apply(ner)

In [44]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,avg_word_len,stopwords,pos,ner
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,6.2,7,12,3
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,5.090909,4,5,1
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138,5.571429,5,4,3
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140,7.75,2,11,2
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107,11.0,0,6,0
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12,121,9.166667,4,5,2
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22,143,5.5,11,12,3
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18,139,6.722222,8,5,1
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17,139,7.176471,8,10,0
9,National reform now destroyed even the essence...,0,1,18,140,6.777778,7,9,2


In [45]:
# Describe
df['ner'].describe()

count    1000.00000
mean        1.74100
std         1.35046
min         0.00000
25%         1.00000
50%         2.00000
75%         3.00000
max         8.00000
Name: ner, dtype: float64