# 1. Importing Libraries and Data

Requirements
- dask
- geopy

In [2]:
#importing libraries
from geopy.geocoders import Nominatim
import dask.bag as db
import json

In [2]:
#JSONL data paths
mar27_20_path = "C:\Data Science\Jupyter_Workspace\Twitter_Sentiment\Data\March27_20\Hydrated\mar27_20_hydrated.jsonl"

# 2. Handling big JSON with Dask

In [3]:
mar27_dask = db.read_text(mar27_20_path).map(json.loads)

In [4]:
#filtering for only data from India
mar27_ind_dask = mar27_dask.filter(lambda tweet: tweet['place'] is not None and tweet['place']['country'] == 'India')

In [5]:
test_arr = {'coordinates': None,
  'place': {'id': '29ddac29937057a0',
   'url': 'https://api.twitter.com/1.1/geo/id/29ddac29937057a0.json',
   'place_type': 'city',
   'name': 'Mangalore',
   'full_name': 'Mangalore, India',
   'country_code': 'IN',
   'country': 'India',
   'contained_within': [],
   'bounding_box': {'type': 'Polygon',
    'coordinates': [[[74.770712, 12.752933],
      [75.014001, 12.752933],
      [75.014001, 13.128654],
      [74.770712, 13.128654]]]},
   'attributes': {}},
  'contributors': None}

test_arr["place"]["bounding_box"]["coordinates"][0][0]

[74.770712, 12.752933]

In [41]:
#function to convert Dask to DataFrame
def flatten(tweet):
    return {
        'id': tweet['id_str'],
        'longitude': tweet['place']['bounding_box']['coordinates'][0][1][0],
        'latitude': tweet['place']['bounding_box']['coordinates'][0][1][1],
        'text': tweet['full_text']
    }
    
mar27_ind_dask.map(flatten).take(1)

({'id': '1243420733230866433',
  'longitude': 75.014001,
  'latitude': 12.752933,
  'text': '@bpcretmanglore provided in-house developed sanitiser at various places inside installation premises to take care of our “warriors” from CORONA spread #StayAwareStaySafe ....@BPCLRetail @BPCLimited @8singhi https://t.co/ICXGr2HVue'},)

In [42]:
mar27_ind_df = mar27_ind_dask.map(flatten).to_dataframe()
mar27_ind_df = mar27_ind_df.compute()
mar27_ind_df.tail()

Unnamed: 0,id,longitude,latitude,text
2879,1243813117110276096,81.110202,26.642291,Nice presentation on corona virus. Slightly lo...
2880,1243813157191086080,73.113183,18.986727,I swear we are fighting two pandemics; Corona ...
2881,1243813213319266304,77.347652,28.397657,#AskZee \nGovt should not send labours to vill...
2882,1243813647085760512,77.786319,12.731936,@SridharAddepal1 @PawanKalyan Wow super ycp ha...
2883,1243813678484303872,73.280893,18.807586,Superb .. nice idea to do https://t.co/BQPxjbCPrL


# 3. Geomapping Data with State

In [43]:
#initialising
geolocator = Nominatim(user_agent="this is kunal")

In [44]:
# Latitude & Longitude input
Latitude_test = "12.752933"
Longitude_test = "75.014001"
  
location_test = geolocator.reverse(Latitude_test+","+Longitude_test)

In [45]:
location_test.raw

{'place_id': 127270795,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'way',
 'osm_id': 160307848,
 'lat': '12.7514538',
 'lon': '75.014541',
 'display_name': 'Manjeswar Road, Kanyana, Bantwal taluk, Dakshina Kannada, Karnataka, 574279, India',
 'address': {'road': 'Manjeswar Road',
  'village': 'Kanyana',
  'county': 'Bantwal taluk',
  'state_district': 'Dakshina Kannada',
  'state': 'Karnataka',
  'postcode': '574279',
  'country': 'India',
  'country_code': 'in'},
 'boundingbox': ['12.7443369', '12.7521015', '74.9943475', '75.0506924']}

In [49]:
def getState(longitude, latitude):
    geolocator = Nominatim(user_agent="this is kunal")
    location = geolocator.reverse(str(latitude) + "," + str(longitude))
    if location is not None:
        if 'state' in location.raw['address']:
            state = location.raw['address']['state']
            return state
        else:
            return None
    else:
        return None

In [50]:
state = getState(74.770712, 12.752933)
print(state)

None


In [51]:
mar27_ind_df["state"] = mar27_ind_df.apply(lambda x : getState(x["longitude"], x["latitude"]), axis=1)

In [56]:
mar27_ind_df.head(5)

Unnamed: 0,id,longitude,latitude,text,state
0,1243420733230866433,75.014001,12.752933,@bpcretmanglore provided in-house developed sa...,Karnataka
1,1243421018070249474,77.786319,12.731936,"Dear @narendramodi Pradhan sevak sir,\nThese a...",Tamil Nadu
2,1243421048172769285,77.347652,28.397657,@angelmsilos Programmers are always on the wor...,Haryana
3,1243421121858244609,77.786319,12.731936,@mlayvpatil @mpbijapur @INCVijayapura @MBPatil...,Tamil Nadu
4,1243421081538408448,88.610027,22.269493,I have created one online form to collect thei...,West Bengal


In [58]:
mar27_ind_df.state.isnull().sum()

141

We can see only about 5% of the total data did not have a State property associated with it, which is alright.

In [122]:
mar27_ind_df.state.unique()

array(['Karnataka', 'Tamil Nadu', 'Haryana', 'West Bengal',
       'Andhra Pradesh', 'Uttarakhand', 'Odisha', 'Maharashtra',
       'Uttar Pradesh', 'Gujarat', 'Himachal Pradesh', 'Telangana',
       'Bihar', 'Meghalaya', 'Chhattisgarh', 'Kerala', 'Rajasthan',
       'Jharkhand', 'Madhya Pradesh', 'চট্টগ্রাম বিভাগ', 'Assam',
       'রংপুর বিভাগ', 'Goa', 'වයඹ පළාත', 'Punjab', 'Jammu and Kashmir',
       'Delhi', 'စစ်ကိုင်းတိုင်း (Sagaing)', 'Manipur', 'খুলনা বিভাগ',
       'Arunachal Pradesh', 'Tripura', 'Puducherry', 'Chandigarh',
       'সিলেট বিভাগ', 'Mizoram', 'བསམ་རྩེ་རྫོང་ཁག་', 'Nagaland',
       'ချင်းပြည်နယ်', 'Dadra and Nagar Haveli and Daman and Diu'],
      dtype=object)

In [61]:
#removing all rows with no State values
mar27_ind_df = mar27_ind_df[~mar27_ind_df['state'].isnull()]

In [62]:
csv_path = "C:\Data Science\Jupyter_Workspace\Twitter_Sentiment\Data\March27_20\Hydrated\mar27_ind_df.csv"
mar27_ind_df.to_csv(csv_path, index=False)

# 4. Building the Pipeline to extract cleaned Indian DF from raw JSONL file

In [3]:
def getIndiaCSV(pathJSONL, pathCSV):
    # 1.loading the big JSONL into DASK for parallelization
    db_dask = db.read_text(pathJSONL).map(json.loads)
    ind_dask = db_dask.filter(lambda tweet: tweet['place'] is not None and tweet['place']['country'] == 'India')
    
    #function to only convert relevant data into Dask DF
    def flatten(tweet):
        return {
        'id': tweet['id_str'],
        'longitude': tweet['place']['bounding_box']['coordinates'][0][1][0],
        'latitude': tweet['place']['bounding_box']['coordinates'][0][1][1],
        'text': tweet['full_text']
        }
        
    ind_dask_df = ind_dask.map(flatten).to_dataframe()
    ind_df = ind_dask_df.compute() #converting Dask DF to Pandas DF
    
    # 2.extracting State through coordinates
    def getState(longitude, latitude):
        geolocator = Nominatim(user_agent="this is kunal")
        location = geolocator.reverse(str(latitude) + "," + str(longitude), language='en')
        if location is not None:
            if 'state' in location.raw['address']:
                state = location.raw['address']['state']
                return state
            else:
                return None
        else:
            return None
        
    ind_df["state"] = ind_df.apply(lambda x : getState(x["longitude"], x["latitude"]), axis=1)
    
    # 3.removing rows with null State values
    ind_df_final = ind_df[~ind_df['state'].isnull()]
    
    #exporting the final DF to CSV file
    ind_df_final.to_csv(pathCSV, index=False)

In [5]:
#March 27, 2020
mar27_path_JSONL = "C:\Data Science\Jupyter_Workspace\Twitter_Sentiment\Data\March27_20\Hydrated\\mar27_20_hydrated.jsonl"
mar27_path_CSV = "C:\Data Science\Jupyter_Workspace\Twitter_Sentiment\Data\March27_20\Hydrated\\mar27_ind_df.csv"
getIndiaCSV(mar27_path_JSONL, mar27_path_CSV)

In [4]:
#March 27, 2020
mar25_path_JSONL = "C:\Data Science\Jupyter_Workspace\Twitter_Sentiment\Data\March25_20\Hydrated\\march25_20_hydrated.jsonl"
mar25_path_CSV = "C:\Data Science\Jupyter_Workspace\Twitter_Sentiment\Data\March25_20\Hydrated\\mar25_ind_df.csv"
getIndiaCSV(mar25_path_JSONL, mar25_path_CSV)