In [1]:
#Basic libraries
import pandas as pd 
import numpy as np 
#Ignore warnings
import warnings
warnings.filterwarnings('ignore')
#Visualization libraries
import matplotlib.pyplot as plt 
from matplotlib import rcParams
import seaborn as sns
from textblob import TextBlob
from plotly import tools
import plotly.graph_objs as go
from plotly.offline import iplot
%matplotlib inline

In [2]:
raw_reviews = pd.read_csv('combined_csv1.csv')

In [3]:
print (raw_reviews.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22149 entries, 0 to 22148
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Tweet_time  22149 non-null  object 
 1   Id          22149 non-null  float64
 2   user name   22149 non-null  object 
 3   Place       15306 non-null  object 
 4   Text        22149 non-null  object 
dtypes: float64(1), object(4)
memory usage: 865.3+ KB
None


In [4]:
raw_reviews['Tweet_time'] = pd.to_datetime(raw_reviews.Tweet_time, format='%d-%m-%Y %H:%M')

In [5]:
print (raw_reviews.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22149 entries, 0 to 22148
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Tweet_time  22149 non-null  datetime64[ns]
 1   Id          22149 non-null  float64       
 2   user name   22149 non-null  object        
 3   Place       15306 non-null  object        
 4   Text        22149 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 865.3+ KB
None


In [6]:
#Creating a copy
process_reviews=raw_reviews.copy()

#Checking for null values
process_reviews.isnull().sum()

Tweet_time       0
Id               0
user name        0
Place         6843
Text             0
dtype: int64

In [7]:
process_reviews['Place']=process_reviews['Place'].fillna('N.A')

In [8]:
process_reviews['Tweet_time'].dt.tz_localize('utc').dt.tz_convert('Asia/Kolkata')

0       2021-05-02 19:28:00+05:30
1       2021-05-02 19:28:00+05:30
2       2021-05-02 19:28:00+05:30
3       2021-05-02 19:27:00+05:30
4       2021-05-02 19:27:00+05:30
                   ...           
22144   2021-05-02 23:11:00+05:30
22145   2021-05-02 23:11:00+05:30
22146   2021-05-02 23:11:00+05:30
22147   2021-05-02 23:11:00+05:30
22148   2021-05-02 23:10:00+05:30
Name: Tweet_time, Length: 22149, dtype: datetime64[ns, Asia/Kolkata]

In [9]:
#Creating a copy 
clean_reviews=process_reviews.copy()
import re
import string

In [10]:
def review_cleaning(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [11]:
process_reviews['Text']=process_reviews['Text'].apply(lambda x:review_cleaning(x))
process_reviews.head()

Unnamed: 0,Tweet_time,Id,user name,Place,Text
0,2021-05-02 13:58:00,1.38886e+18,rishitagupta_,N.A,rt matchday\r\r\r\rpkbs klrahul dc cricket p...
1,2021-05-02 13:58:00,1.38886e+18,soul_cricket,N.A,rt matchday\r\r\r\rpkbs klrahul dc cricket p...
2,2021-05-02 13:58:00,1.38886e+18,Im4TestCricket,Bangalore-Srikalahasti,rt adsutherland if the ipl were a first class ...
3,2021-05-02 13:57:00,1.38886e+18,JuliasMotaung,"Johannesburg,Gauteng",pirates vs sundowns match is boring ☹️ i rathe...
4,2021-05-02 13:57:00,1.38886e+18,CricTelegraph,At Cricket,punjab kings playing xi 🏏\r\ripl pbks delhica...


In [12]:
stop_words= ['yourselves', 'between', 'whom', 'itself', 'is', "she's", 'up', 'herself', 'here', 'your', 'each', 
             'we', 'he', 'my', "you've", 'having', 'in', 'both', 'for', 'themselves', 'are', 'them', 'other',
             'and', 'an', 'during', 'their', 'can', 'yourself', 'she', 'until', 'so', 'these', 'ours', 'above', 
             'what', 'while', 'have', 're', 'more', 'only', "needn't", 'when', 'just', 'that', 'were', "don't", 
             'very', 'should', 'any', 'y', 'isn', 'who',  'a', 'they', 'to', 'too', "should've", 'has', 'before',
             'into', 'yours', "it's", 'do', 'against', 'on',  'now', 'her', 've', 'd', 'by', 'am', 'from', 
             'about', 'further', "that'll", "you'd", 'you', 'as', 'how', 'been', 'the', 'or', 'doing', 'such',
             'his', 'himself', 'ourselves',  'was', 'through', 'out', 'below', 'own', 'myself', 'theirs', 
             'me', 'why', 'once',  'him', 'than', 'be', 'most', "you'll", 'same', 'some', 'with', 'few', 'it',
             'at', 'after', 'its', 'which', 'there','our', 'this', 'hers', 'being', 'did', 'of', 'had', 'under',
             'over','again', 'where', 'those', 'then', "you're", 'i', 'because', 'does', 'all']

In [13]:
process_reviews['Text'] = process_reviews['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
process_reviews.head()

Unnamed: 0,Tweet_time,Id,user name,Place,Text
0,2021-05-02 13:58:00,1.38886e+18,rishitagupta_,N.A,rt matchday pkbs klrahul dc cricket punjabking...
1,2021-05-02 13:58:00,1.38886e+18,soul_cricket,N.A,rt matchday pkbs klrahul dc cricket punjabking...
2,2021-05-02 13:58:00,1.38886e+18,Im4TestCricket,Bangalore-Srikalahasti,rt adsutherland if ipl first class tournament ...
3,2021-05-02 13:57:00,1.38886e+18,JuliasMotaung,"Johannesburg,Gauteng",pirates vs sundowns match boring ☹️ rather wat...
4,2021-05-02 13:57:00,1.38886e+18,CricTelegraph,At Cricket,punjab kings playing xi 🏏 ipl pbks delhicapita...


In [14]:
process_reviews['polarity_C'] = process_reviews['Text'].map(lambda text: TextBlob(text).sentiment.polarity)
process_reviews['subjectivity'] = process_reviews['Text'].map(lambda text: TextBlob(text).sentiment.subjectivity)

In [15]:
def f(row):
    
    '''This function returns sentiment value based on the overall ratings from the user'''
    
    if row['polarity_C'] == 0.0:
        val = 0
    elif row['polarity_C'] < 0.0:
        val = -1
    elif row['polarity_C'] > 0.0:
        val = 1
    else:
        val = -1
    return val

In [16]:
#Applying the function in our new column
process_reviews['sentiment'] = process_reviews.apply(f, axis=1)
process_reviews.head()

Unnamed: 0,Tweet_time,Id,user name,Place,Text,polarity_C,subjectivity,sentiment
0,2021-05-02 13:58:00,1.38886e+18,rishitagupta_,N.A,rt matchday pkbs klrahul dc cricket punjabking...,0.0,0.0,0
1,2021-05-02 13:58:00,1.38886e+18,soul_cricket,N.A,rt matchday pkbs klrahul dc cricket punjabking...,0.0,0.0,0
2,2021-05-02 13:58:00,1.38886e+18,Im4TestCricket,Bangalore-Srikalahasti,rt adsutherland if ipl first class tournament ...,0.075,0.616667,1
3,2021-05-02 13:57:00,1.38886e+18,JuliasMotaung,"Johannesburg,Gauteng",pirates vs sundowns match boring ☹️ rather wat...,-1.0,1.0,-1
4,2021-05-02 13:57:00,1.38886e+18,CricTelegraph,At Cricket,punjab kings playing xi 🏏 ipl pbks delhicapita...,0.0,0.0,0


In [17]:
process_reviews['sentiment'].value_counts()

 0    12355
 1     7918
-1     1876
Name: sentiment, dtype: int64

In [18]:
df = process_reviews.groupby(raw_reviews['Tweet_time']).mean()

In [19]:
df['sentiment'].value_counts()

 0.000000    5
 0.500000    5
 0.200000    3
 0.142857    3
 0.285714    3
            ..
 0.427350    1
 0.642857    1
 0.191489    1
-0.006711    1
 0.750000    1
Name: sentiment, Length: 230, dtype: int64

In [20]:
df['sentiment1'] = df.apply(f, axis=1)

In [21]:
def f1(row):
    
    '''This function returns sentiment value based on the overall ratings from the user'''
    
    if row['sentiment'] == 0.0:
        val = 0
    elif row['sentiment'] < 0.0:
        val = -1
    elif row['sentiment'] > 0.0:
        val = 1
    else:
        val = -1
    return val

In [None]:
df['sentiment2'] = process_reviews.apply(f, axis=1)

In [None]:
df.to_csv('out4.csv')

In [None]:
df.head()

In [None]:
df.rename(columns={'time': 'Tweet_time'})

In [None]:
df.rename(columns={'Tweet_time': 'time'}, inplace=True)

In [23]:
df.head()

Unnamed: 0_level_0,Id,polarity_C,subjectivity,sentiment,sentiment1
Tweet_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-05-02 13:35:00,1.38885e+18,0.369697,0.34375,0.75,1
2021-05-02 13:36:00,1.38885e+18,0.109921,0.151984,0.142857,1
2021-05-02 13:37:00,1.38885e+18,0.142308,0.140705,0.384615,1
2021-05-02 13:38:00,1.38885e+18,0.122222,0.144444,0.0,1
2021-05-02 13:39:00,1.38885e+18,0.127273,0.35,0.181818,1


In [24]:
df['Tweet_time'].dt.strftime('%Y-%m-%d %H:%M:%S')

KeyError: 'Tweet_time'