In this notebook we average the VADER sentiment scores of all tweets within each Indian state, so that we end up with a DataFrame holding one mean score per state. That DataFrame is the input for the geo-mapping done in Notebook 5.

# Importing Libraries and Data

In [1]:
!pip install geojson



In [2]:
#importing libraries
import pandas as pd 

In [3]:
# Paths for the March 27 data: the input CSV with one VADER score per tweet
# coordinate, and the output CSV that receives the per-state mean scores.
# Raw strings keep every backslash literal, so Windows path segments such as
# "\Data" or "\Hydrated" are never parsed as (invalid) escape sequences.
scoresCSV_path = r"C:\Data Science\Jupyter_Workspace\Twitter_Sentiment\Data\March27_20\Hydrated\mar27_score_df.csv"
stateCSV_path = r"C:\Data Science\Jupyter_Workspace\Twitter_Sentiment\Data\March27_20\Hydrated\mar27_state_df.csv"
# Load the per-tweet VADER scores for March 27 into a DataFrame.
mar27_score = pd.read_csv(scoresCSV_path)

# Grouping the Data by State

In [4]:
# Preview the first five rows to check the loaded columns.
mar27_score.head(5)

Unnamed: 0,id,longitude,latitude,text,state,vader_text,vader_score
0,1243420733230866433,75.014001,12.752933,@bpcretmanglore provided in-house developed sa...,Karnataka,provided in-house developed sanitiser at vari...,0.4939
1,1243421018070249474,77.786319,12.731936,"Dear @narendramodi Pradhan sevak sir,\nThese a...",Tamil Nadu,"Dear Pradhan sevak sir,\nThese are few tasks ...",0.8402
2,1243421048172769285,77.347652,28.397657,@angelmsilos Programmers are always on the wor...,Haryana,Programmers are always on the work. It does n...,-0.3678
3,1243421121858244609,77.786319,12.731936,@mlayvpatil @mpbijapur @INCVijayapura @MBPatil...,Tamil Nadu,please take note to avoid potential spr...,0.0258
4,1243421081538408448,88.610027,22.269493,I have created one online form to collect thei...,West Bengal,I have created one online form to collect thei...,0.5719


In [5]:
# Number of rows (scored tweets) per distinct value of the `state` column.
mar27_score['state'].value_counts()

Maharashtra                                 547
Haryana                                     451
Uttar Pradesh                               378
Tamil Nadu                                  273
West Bengal                                 131
Gujarat                                     130
Karnataka                                   110
Telangana                                   109
Madhya Pradesh                               71
Rajasthan                                    69
Bihar                                        69
Odisha                                       68
Jammu and Kashmir                            55
Jharkhand                                    53
Andhra Pradesh                               33
Assam                                        27
Uttarakhand                                  26
Chhattisgarh                                 26
Punjab                                       25
Kerala                                       22
North Western Province                  

In [6]:
# Collapse the per-tweet rows into one row per state holding the mean VADER score.
mar27_state = (
    mar27_score[['state', 'vader_score']]
    .groupby('state', as_index=False)
    .mean()
)

In [7]:
# Display the per-state mean scores computed above.
mar27_state

Unnamed: 0,state,vader_score
0,Andhra Pradesh,0.251303
1,Arunachal Pradesh,-0.16602
2,Assam,0.071552
3,Bihar,0.080788
4,Chandigarh,0.3182
5,Chattogram Division,0.08625
6,Chhattisgarh,0.023615
7,Chin,0.0
8,Dadra and Nagar Haveli and Daman and Diu,0.0
9,Delhi,0.39173


In [8]:
# Rename the states whose spelling differs from the keys used in the GeoJSON file.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on the selected Series: under pandas Copy-on-Write
# the in-place call only changes a temporary object, so `mar27_state` would
# silently keep the old names.
mar27_state["state"] = mar27_state["state"].replace({"Jammu and Kashmir": "Jammu & Kashmir",
                                                     "Dadra and Nagar Haveli and Daman and Diu": "Dadara & Nagar Havelli",
                                                     "Arunachal Pradesh": "Arunanchal Pradesh",
                                                     "Delhi": "NCT of Delhi"})

# Filtering the States to match with GeoJSON states

In [9]:
import geojson

# Load the India states GeoJSON file. GeoJSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default
# (which is usually not UTF-8 on Windows).
with open('states_india.geojson', encoding='utf-8') as f:
    gj = geojson.load(f)

In [10]:
#locating where the state name lives in the GeoJSON data:
#each entry of gj['features'] stores it under properties -> "st_nm"
#(index 35 is used here as a spot check)
gj['features'][35]["properties"]["st_nm"]

'NCT of Delhi'

In [11]:
# Build the sorted list of unique state names (`st_nm`) found in the GeoJSON file.
# Iterating over the features directly avoids hard-coding the feature count (36),
# and the set guarantees each name appears only once before sorting.
state_list = sorted({feature["properties"]["st_nm"] for feature in gj['features']})
state_list

['Andaman & Nicobar Island',
 'Andhra Pradesh',
 'Arunanchal Pradesh',
 'Assam',
 'Bihar',
 'Chandigarh',
 'Chhattisgarh',
 'Dadara & Nagar Havelli',
 'Daman & Diu',
 'Goa',
 'Gujarat',
 'Haryana',
 'Himachal Pradesh',
 'Jammu & Kashmir',
 'Jharkhand',
 'Karnataka',
 'Kerala',
 'Lakshadweep',
 'Madhya Pradesh',
 'Maharashtra',
 'Manipur',
 'Meghalaya',
 'Mizoram',
 'NCT of Delhi',
 'Nagaland',
 'Odisha',
 'Puducherry',
 'Punjab',
 'Rajasthan',
 'Sikkim',
 'Tamil Nadu',
 'Telangana',
 'Tripura',
 'Uttar Pradesh',
 'Uttarakhand',
 'West Bengal']

In [12]:
# State values present in the March 27 scores that have no matching name in the GeoJSON state list
print("Missing values in GeoJSON State List:", set(mar27_state["state"]) - set(state_list))

Missing values in GeoJSON State List: {'Sylhet Division', 'Samtse District', 'Rangpur Division', 'Chin', 'Khulna Division', 'Sagaing', 'North Western Province', 'Chattogram Division'}


# Building the Pipeline for State Mean Score

In [13]:
def get_state_score(score_path, state_path):
    """Average per-tweet VADER scores by state and export the result to CSV.

    The CSV at ``score_path`` is read, ``vader_score`` is averaged per
    ``state``, state names are rewritten to the spellings used as keys in the
    GeoJSON file, rows whose state is not one of the GeoJSON states are
    removed, and the remaining rows are written to ``state_path``.

    Parameters
    ----------
    score_path : str or path-like
        CSV file containing at least the columns ``state`` and ``vader_score``.
    state_path : str or path-like
        Destination CSV; written without the index, with the columns
        ``state`` and ``vader_score``.

    Returns
    -------
    None
        The result is only written to ``state_path``.
    """
    # 1. grouping to get the mean score for each state
    score_df = pd.read_csv(score_path)
    state_df = score_df.groupby('state', as_index=False)['vader_score'].mean()

    # 2. correcting state names to match the GeoJSON keys.
    # The result is assigned back to the column: an in-place replace on the
    # selected Series does not reach the frame under pandas Copy-on-Write, and
    # step 3 would then silently drop every state that still has its old name.
    geojson_names = {"Jammu and Kashmir": "Jammu & Kashmir",
                     "Dadra and Nagar Haveli and Daman and Diu": "Dadara & Nagar Havelli",
                     "Arunachal Pradesh": "Arunanchal Pradesh",
                     "Delhi": "NCT of Delhi"}
    state_df["state"] = state_df["state"].replace(geojson_names)

    # 3. keeping only the rows whose state exists in the GeoJSON file
    state_list = ['Andaman & Nicobar Island', 'Andhra Pradesh', 'Arunanchal Pradesh', 'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh',
                  'Dadara & Nagar Havelli', 'Daman & Diu', 'Goa', 'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu & Kashmir',
                  'Jharkhand', 'Karnataka', 'Kerala', 'Lakshadweep', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram',
                  'NCT of Delhi', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
                  'Uttar Pradesh', 'Uttarakhand', 'West Bengal']
    state_df = state_df[state_df["state"].isin(state_list)]

    # 4. exporting state DF to CSV
    state_df.to_csv(state_path, index=False)

In [14]:
#March 27, 2020: read the scores at scoresCSV_path and write the per-state means to stateCSV_path
get_state_score(scoresCSV_path, stateCSV_path)

In [15]:
#March 25, 2020
# Raw strings keep the Windows backslashes literal instead of relying on
# invalid escape sequences such as "\D" or "\H" being passed through.
# Note: `mar25_score` and `mar25_state` hold file paths (strings), not DataFrames.
mar25_score = r"C:\Data Science\Jupyter_Workspace\Twitter_Sentiment\Data\March25_20\Hydrated\mar25_score_df.csv"
mar25_state = r"C:\Data Science\Jupyter_Workspace\Twitter_Sentiment\Data\March25_20\Hydrated\mar25_state_df.csv"
# Run the same pipeline on the March 25 scores; the output goes to the path stored in mar25_state.
get_state_score(mar25_score, mar25_state)

# Pipeline for Appending Date to CSV

In [16]:
import re

# Source file for March 25; its file name carries the date as digits (25032020).
march_25 = r"C:\Data Science\Jupyter_Workspace\Twitter_Sentiment\Data\March25_20\Hydrated\25032020.csv"
# Raw-string pattern: `\d+` captures the digits and `\.` matches the literal dot
# of the extension (an unescaped `.` would match any character).
res = re.findall(r"(\d+)\.csv", march_25)
# Kept as an int, as before. Note that an int drops a leading zero, so a file
# named "05032020.csv" would yield 5032020.
date = int(res[0])
date

25032020

In [17]:
# Add a `date` column holding the parsed date on every row.
# NOTE(review): `date` was parsed from the March 25 file name (25032020) but is
# attached to the March 27 frame here; confirm this is only a demonstration.
mar27_state["date"] = date

In [18]:
# Check that the new `date` column is present.
mar27_state.head(5)

Unnamed: 0,state,vader_score,date
0,Andhra Pradesh,0.251303,25032020
1,Arunanchal Pradesh,-0.16602,25032020
2,Assam,0.071552,25032020
3,Bihar,0.080788,25032020
4,Chandigarh,0.3182,25032020


In [19]:
# NOTE: `mar25_state` is the output CSV path (a string), not a DataFrame like `mar27_state`.
mar25_state

'C:\\Data Science\\Jupyter_Workspace\\Twitter_Sentiment\\Data\\March25_20\\Hydrated\\mar25_state_df.csv'

In [20]:
# Two small frames with identical columns, used to try out stacking state/date rows.
df1 = pd.DataFrame({"state": ['Andhra Pradesh', 'Arunanchal Pradesh'],
                    "vader_score": [0.251303, -0.166020],
                    "date": [25032020, 25032020]})

df2 = pd.DataFrame({"state": ['Assam', 'Bihar'],
                    "vader_score": [0.071552, 0.080788],
                    "date": [25032020, 25032020]})

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` with
# ignore_index=True stacks the rows and renumbers the index 0..n-1.
df1 = pd.concat([df1, df2], ignore_index=True)

In [21]:
# Display the combined frame.
df1

Unnamed: 0,state,vader_score,date
0,Andhra Pradesh,0.251303,25032020
1,Arunanchal Pradesh,-0.16602,25032020
2,Assam,0.071552,25032020
3,Bihar,0.080788,25032020
