In [2]:
!pip install -U -q preprocessor
!pip install -U -q vaderSentiment
!pip install -U -q nltk
!pip install -U -q textblob
!pip install -U -q gensim
!pip install -U -q tqdm
!pip install -U -q bs4
!pip install python-Levenshtein
!pip install -U -q plotly



In [3]:
import preprocessor as p
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np
import gensim
import re
from tqdm import tqdm
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import seaborn as sns
import plotly.io as pio
py.init_notebook_mode(connected=True)
# Required for plotly to run on colab
pio.renderers.default = 'colab'
%matplotlib inline

In [4]:
tqdm.pandas()

**Connects to your google drive**

In [5]:
import os
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [6]:
os.chdir('/content/drive/MyDrive/SparkIntern')

In [33]:
# 100,000 rows to test the processing with limited data set
# df = pd.read_csv("india-news-headlines.csv", nrows=500000)
df = pd.read_csv("india-news-headlines.csv")


In [34]:
# Timestamp conversion to datetime without localization
df["date"] = pd.to_datetime(df["publish_date"],format='%Y%m%d', errors= "coerce").dt.tz_localize(None)

In [41]:
start_year = pd.to_datetime('2009-01-01')
data = df[df['date'] >= start_year]

In [42]:
data.head(5)

Unnamed: 0,publish_date,headline_category,headline_text,date
664738,20090101,unknown,Sterling performance for your eyes only,2009-01-01
664739,20090101,life-style.health-fitness.health-news,Six common medical myths debunked,2009-01-01
664740,20090101,life-style.health-fitness.health-news,Tired? Saunter amid greens,2009-01-01
664741,20090101,life-style.fashion.shows,No New Years bash for Krishna,2009-01-01
664742,20090101,city.ahmedabad,Second-year arts student finds a way to beat t...,2009-01-01


In [43]:
data.shape

(2759329, 4)

**Clean function using a combination of re and preprocessor**

In [44]:
def cleaning(text):
    cleantext = p.clean(text)
    cleantext = re.sub('[^a-zA-Z ]','',cleantext)
    return cleantext
def clean(data):
    #cleanr = re.compile("<.*?>")
    cleanr= re.sub(r"http\S+", "", data)
    
    cleanr= re.sub(r"pic.\S+", "", cleanr)
    #cleanr= re.sub(cleanr, "", data)
    #cleantext = re.sub(cleanr,'',data)
    cleantext = re.sub('[^a-zA-Z# ]','',cleanr)
    clean_hash = re.sub(r"#\S+","",cleantext)
    
    return clean_hash
data["text"] = data.progress_apply(lambda x:clean(str(x["headline_text"])),axis=1)

data.head(2)

100%|██████████| 2759329/2759329 [01:01<00:00, 44900.52it/s]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,publish_date,headline_category,headline_text,date,text
664738,20090101,unknown,Sterling performance for your eyes only,2009-01-01,Sterling performance for your eyes only
664739,20090101,life-style.health-fitness.health-news,Six common medical myths debunked,2009-01-01,Six common medical myths debunked


**Tokenize and Stem and then join the text**

In [45]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
porter = PorterStemmer()

def stemSentence(sentence):
    #Tokenize words list
    tokens = [word.lower() for sent in nltk.sent_tokenize(sentence) for word in nltk.word_tokenize(sent)]
    stem_sentence=[]
    # Create list of word tokens after removing the stopwords
    filtered_sentence =[] 
    for word in tokens:
      if word not in stopwords:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**TextBlob functions to calculate Sentiment scores**

In [46]:
data["subjectivity"] = np.zeros((data.shape[0],1))
data["blob_sent"] = np.zeros((data.shape[0],1))
def textblob_sent(text):
    text = stemSentence(text)
    analysis = TextBlob(str(text))
    return analysis.sentiment[0]
def textblob_sub(text):
    text = stemSentence(text)
    analysis = TextBlob(str(text))
    return analysis.sentiment[1]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



**Calculating blob sentiment for every tweet**

In [47]:
data["blob_sent"] = data.progress_apply(lambda x:textblob_sent(str(x["text"])),axis=1)

100%|██████████| 2759329/2759329 [29:19<00:00, 1568.27it/s]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



**Save the results to save time for processing again**

In [48]:
# data.to_csv("blob_sent_recorded_1L.csv")
data.head(2)

Unnamed: 0,publish_date,headline_category,headline_text,date,text,subjectivity,blob_sent
664738,20090101,unknown,Sterling performance for your eyes only,2009-01-01,Sterling performance for your eyes only,0.0,0.0
664739,20090101,life-style.health-fitness.health-news,Six common medical myths debunked,2009-01-01,Six common medical myths debunked,0.0,-0.3


**Subjectivity calculation for each tweet**

In [49]:
data["subjectivity"] = data.progress_apply(lambda x:textblob_sub(x["text"]),axis=1)
# data.to_csv("subjectivity_recorded_1L.csv")


100%|██████████| 2759329/2759329 [29:21<00:00, 1566.14it/s]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
data.shape

(121460, 8)

**Drop rows with zero sentiment and sbjectivity**

In [50]:
index_names = data[ (data['subjectivity'] == 0.0) & (data['blob_sent'] == 0.000000)].index
  
# drop these given row indexes from dataFrame
data.drop(index_names, inplace = True)  



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [51]:
data.shape

(792169, 7)

**Vader sentiment**

In [52]:
sid = SentimentIntensityAnalyzer()
def vader(text):
    score = sid.polarity_scores(text)
    return score["compound"]


**VADER sentiment calculation**

In [54]:
data["vader_sent"] = data.progress_apply(lambda x:vader(str(x["text"])),axis=1)
data.head(2)

100%|██████████| 792169/792169 [01:15<00:00, 10437.83it/s]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,publish_date,headline_category,headline_text,date,text,subjectivity,blob_sent,vader_sent
664739,20090101,life-style.health-fitness.health-news,Six common medical myths debunked,2009-01-01,Six common medical myths debunked,0.5,-0.3,0.0
664740,20090101,life-style.health-fitness.health-news,Tired? Saunter amid greens,2009-01-01,Tired Saunter amid greens,0.3,-0.2,-0.4404


**Weight corresponding to adwords are calculated here**

In [55]:
adwords = ["give","referral","referal","bonus","signup","invite","freebitcoin","lucky",
          "lottery","giveaway","coupon","prizes","enter","airdrop","game","contest","bonuses"]
def contains_word(s, w):
    return (' ' + w + ' ') in (' ' + s + ' ')
def adchecker(text):
    words = []
    count = 0
    for i in adwords:
        if (contains_word(text,i)):
            count+= 1
            words.append(i)
    return count
def adword(text):
    words = []
    count = 0
    for i in adwords:
        if (contains_word(text,i)):
            count+= 1
            words.append(i)
    return words

**Computation of adwords weight**

In [56]:
data["ad_count"] = np.zeros((data.shape[0],1))
data["adwords"] = np.zeros((data.shape[0],1))

data["ad_count"] = data.progress_apply(lambda x:adchecker(x["text"]),axis=1)
data["adwords"] = data.progress_apply(lambda x:adword(x["text"]),axis=1)

# data.to_csv("clean100.csv")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

100%|██████████| 792169/792169 [00:19<00:00, 40189.59it/s]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

100%|██████████| 792169/792169 [00:20<00:00, 38401.36it/s]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

**Classifies whether a tweet is an ad or not**

In [57]:
def ad_classifier(x):
    try:
        if x >1 :
            return 0
        else:
            return 1
    except:
        print(x)

data["ad_class"] = data.progress_apply(lambda x:ad_classifier(int(x["ad_count"])),axis=1)

100%|██████████| 792169/792169 [00:11<00:00, 66360.79it/s] 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



**Checking counts**

In [58]:
data.astype({'ad_count': 'float64'}).dtypes
#nlarg = data.nlargest(30,"ad_count")
count = data["ad_class"].value_counts()
print(count)

1    792144
0        25
Name: ad_class, dtype: int64


**Weightage with keywords**

In [59]:
keywords = ['onchain transaction', 'congestion', 'market cap','market cap drop', 'dispersion of returns', 'downturn', 'exchange volume', 'ATH', 'ath', 'atl', 'ATL',
            'bear trap', 'btfd', 'BTFD', 'buy wall', 'deflation', 'dump', 'fomo', 'FOMO', 'mooning', 'whale']
def keychecker(text):
    words = []
    count = 0
    for i in keywords:
        if (contains_word(str(text),i)):
            count+= 1
            words.append(i)
    return count
def keyword(text):
    words = []
    count = 0
    for i in keywords:
        if (contains_word(str(text),i)):
            count+= 1
            words.append(i)
    return words
def contains_word(s, w):
    return (' ' + w + ' ') in (' ' + s + ' ')

data["key_count"] = data.progress_apply(lambda x:keychecker(x["text"]),axis=1)
data["keywords"] = data.progress_apply(lambda x:keyword(x["text"]),axis=1)


100%|██████████| 792169/792169 [00:24<00:00, 32867.75it/s]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

100%|██████████| 792169/792169 [00:24<00:00, 32076.76it/s]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [60]:
promu = data.nlargest(5,"key_count")
promu

Unnamed: 0,publish_date,headline_category,headline_text,date,text,subjectivity,blob_sent,vader_sent,ad_count,adwords,ad_class,key_count,keywords
672386,20090114,city.goa,From green ghat to dump,2009-01-14,From green ghat to dump,0.3,-0.2,-0.3818,0,[],1,1,[dump]
674774,20090118,city.goa,Green zone turns dump yard,2009-01-18,Green zone turns dump yard,0.3,-0.2,-0.3818,0,[],1,1,[dump]
675858,20090120,city.delhi,New lanes to ease congestion at toll plaza,2009-01-20,New lanes to ease congestion at toll plaza,0.454545,0.136364,0.3612,0,[],1,1,[congestion]
683788,20090202,business.india-business,Expansion key to fight downturn,2009-02-02,Expansion key to fight downturn,1.0,0.0,-0.3818,0,[],1,1,[downturn]
691475,20090214,city.bengaluru,First person account: The electronic whale,2009-02-14,First person account The electronic whale,0.333333,0.25,0.0,0,[],1,1,[whale]


**Add time as index**

In [61]:
data['time'] = data['date']
data.index = data['time']




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



**Calculate daily weights using other attributes calculated in the previous steps**

In [62]:
data["Daily Weight"] = data["key_count"]*data["ad_class"]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



**Fusing daily weight weight with TextBlob and VADER respectively**

In [63]:
def daily_weight_checker(wt, sentiment_val):
  if wt != 0:
    return wt*sentiment_val
  else:
    return sentiment_val

In [64]:
# data["blob_sent"] = data["blob_sent"]*data["Daily Weight"]
data["blob_sent"] = data.progress_apply(lambda x:daily_weight_checker(x["Daily Weight"], x["blob_sent"]),axis=1)
data["vader_sent"] = data.progress_apply(lambda x:daily_weight_checker(x["Daily Weight"], x["vader_sent"]),axis=1)

100%|██████████| 792169/792169 [00:19<00:00, 39833.86it/s]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

100%|██████████| 792169/792169 [00:19<00:00, 41593.06it/s]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [65]:
data.head(2)

Unnamed: 0_level_0,publish_date,headline_category,headline_text,date,text,subjectivity,blob_sent,vader_sent,ad_count,adwords,ad_class,key_count,keywords,time,Daily Weight
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2009-01-01,20090101,life-style.health-fitness.health-news,Six common medical myths debunked,2009-01-01,Six common medical myths debunked,0.5,-0.3,0.0,0,[],1,0,[],2009-01-01,0
2009-01-01,20090101,life-style.health-fitness.health-news,Tired? Saunter amid greens,2009-01-01,Tired Saunter amid greens,0.3,-0.2,-0.4404,0,[],1,0,[],2009-01-01,0


**Calculate daily features**

In [66]:
sentiment_grouped_daily = data.groupby(pd.Grouper(key="time", freq="D")).agg(
    Daily_Weight_mean_by_day = ('Daily Weight', np.mean),
    Daily_Weight_count_by_day = ('Daily Weight', 'count'),
    blob_sent_mean_by_day = ('blob_sent', np.mean),
    subjectivity_mean_by_day = ('subjectivity', np.mean),
    vader_sent_mean_by_day = ('vader_sent', np.mean)
)

**Drop Null rows as data will be missing fo some hour values**

In [67]:
drop_column_subset = sentiment_grouped_daily.columns.values[1:]
sentiment_grouped_daily.dropna(subset=drop_column_subset, inplace=True)
sentiment_grouped_daily.head()

Unnamed: 0_level_0,Daily_Weight_mean_by_day,Daily_Weight_count_by_day,blob_sent_mean_by_day,subjectivity_mean_by_day,vader_sent_mean_by_day
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-01-01,0.0,118,0.155071,0.44725,0.093475
2009-01-02,0.0,140,0.073924,0.479974,-0.012027
2009-01-03,0.0,137,0.085556,0.46204,0.050018
2009-01-04,0.0,116,0.089732,0.462819,0.038505
2009-01-05,0.0,125,0.109155,0.483551,0.01155


**Method to classify the movement of features**

In [68]:
def movement_classifier(x):
    try:
        if x >= 0 :
            return 1
        else:
            return 0
    except:
        print(x)

**Calculate differences to apply the movement_classifier**

In [69]:
sentiment_grouped_daily["news_diff"] = sentiment_grouped_daily["Daily_Weight_count_by_day"].diff()
sentiment_grouped_daily["blob_sent_mean_by_day_diff"] = sentiment_grouped_daily["blob_sent_mean_by_day"].diff()
sentiment_grouped_daily["vader_sent_mean_by_day_diff"] = sentiment_grouped_daily["vader_sent_mean_by_day"].diff()
sentiment_grouped_daily["news_movement"] = sentiment_grouped_daily["news_diff"].progress_apply(lambda x:movement_classifier(x))
sentiment_grouped_daily["blob_sent_movement"] = sentiment_grouped_daily["blob_sent_mean_by_day_diff"].progress_apply(lambda x:movement_classifier(x))
sentiment_grouped_daily["vader_sent_movement"] = sentiment_grouped_daily["vader_sent_mean_by_day_diff"].progress_apply(lambda x:movement_classifier(x))

100%|██████████| 4383/4383 [00:00<00:00, 583200.13it/s]
100%|██████████| 4383/4383 [00:00<00:00, 576379.82it/s]
100%|██████████| 4383/4383 [00:00<00:00, 475483.91it/s]


**Save the data to save the processing time for future use**

In [70]:
sentiment_grouped_daily.head(5)
os.chdir("/content/drive/MyDrive/SparkIntern")
sentiment_grouped_daily.to_csv("sentiments_processed_data.csv")

In [None]:
sentiment_grouped_daily.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3534 entries, 2001-01-02 to 2010-10-26
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Daily_Weight_mean_by_day     3534 non-null   float64
 1   Daily_Weight_count_by_day    3534 non-null   int64  
 2   blob_sent_mean_by_day        3534 non-null   float64
 3   subjectivity_mean_by_day     3534 non-null   float64
 4   vader_sent_mean_by_day       3534 non-null   float64
 5   news_diff                    3533 non-null   float64
 6   blob_sent_mean_by_day_diff   3533 non-null   float64
 7   vader_sent_mean_by_day_diff  3533 non-null   float64
 8   news_movement                3534 non-null   int64  
 9   blob_sent_movement           3534 non-null   int64  
 10  vader_sent_movement          3534 non-null   int64  
dtypes: float64(7), int64(4)
memory usage: 331.3 KB
