In [21]:
"""
Conventions to follow :

1) Place the downloaded tweets(JSON files) on your desktop screen inside a folder named as "tweets_json".

Note : The conversion of coordinate to value takes some time to execute, Have Patience :P
   
"""

'\nConventions to follow :\n\n1) Place the downloaded tweets(JSON files) on your desktop screen inside a folder named as "tweets_json".\n\nNote : The conversion of coordinate to value takes some time to execute, Have Patience :P\n   \n'

In [32]:
# Importing libraries
import glob,os
import pandas as pd
from tqdm import tqdm_notebook
import numpy as np

import matplotlib.pyplot as plt
import utm
import math
from shapely.geometry import Point, Polygon

from twokenize3 import *
from sentiment import *

In [33]:
# Geospatial Grid
def generateGrid(coors, distance_offset):
    """ Getting the grids of 1000 meters square.
        parameters: minx, miny, maxx, maxy 
        returns: grid of the 
    """
    dx = distance_offset
    dy = distance_offset

    nx = int(math.ceil(abs(coors['up_right_x'] - coors['low_left_x'])/dx))
    ny = int(math.ceil(abs(coors['up_right_y'] - coors['low_left_y'])/dy))
    grid = []

    for i in range(ny):
        for j in range(nx):
            vertices = []
            vertices.append([coors['low_left_x']+dx*j,coors['low_left_y']+dy*i])
            vertices.append([coors['low_left_x']+dx*(j+1),coors['low_left_y']+dy*i])
            vertices.append([coors['low_left_x']+dx*(j+1),coors['low_left_y']+dy*(i+1)])
            vertices.append([coors['low_left_x']+dx*j,coors['low_left_y']+dy*(i+1)])
            grid.append(vertices)
            
    return grid

def pointInPolygon(point, polygons):
    pt = Point(point)
    for poly in polygons:
        pol = Polygon(poly)
        if(pol.contains(pt)):
            return(polygons.index(poly))
    return np.nan

In [34]:
# minimum and maximum X and Y coordinates to create the grid and the grid offset
def initialise_grid():    
    offset = 1000
    coors = {'low_left_x' : 421710.112401581, 'low_left_y' : 4610737.961457818, 'up_right_x' : 456608.39121255605, 'up_right_y' : 4652466.087380382}
    grid = generateGrid(coors, offset)
    return grid

In [35]:
# Creates hdf5 documents on the basis of c2v values
def makeDocuments(c2v_values,dataset):
    desktop_path = os.path.join(os.path.join(os.path.expanduser('~')), 'Desktop')
    os.chdir(desktop_path)
    directory = "hdf5_Doc"
    if not os.path.exists(directory):
        os.makedirs(directory)
    os.chdir(desktop_path + "/" + directory)
    for values in c2v_values:
        doc = dataset[dataset["c2v"] == values]
        filename = str(values) + ".hdf5"
        doc.reset_index(drop=True)
        doc.to_hdf(filename,'key')

In [36]:
# Processing dataframe
def create_dataframe():
    desktop_path = os.path.join(os.path.join(os.path.expanduser('~')), 'Desktop')
    tweets_path = desktop_path + "/tweets_json/*.json"
    tweets = pd.concat(map(pd.read_json, tqdm_notebook(glob.glob(tweets_path))))
    return tweets

# Removing the null values from the input dataframe
def filter_dataframe(data):
    # coordinates = longitude
    # geo = lattitude
    headers = ["geo", "text","lang","timestamp_ms"]
    df = data[headers]
    for columns in headers:
        df = df[df[columns].notnull()]
    # Considering the tweets only in english language
    return df[df["lang"] == "en"].reset_index(drop=True)

# Converts the coordinates to their respective grid index
def c2v(dataset):
    c2v_list = []
    grid = initialise_grid()
    for index,values in enumerate(dataset["geo"]):
        pt = utm.from_latlon(values["coordinates"][0],values["coordinates"][1])[0:2]
        # Coordinate to value function
        c2v_list.append(pointInPolygon(pt, grid))
    dataset["c2v"] = c2v_list
    return dataset

In [37]:
# Initialising dataframe
def initialise_dataframe():
    df = create_dataframe()
    df = filter_dataframe(df)
    return df

# Process the dataframe to get sentiment and tokenized values
def process_dataframe(dataset):
    myDic = {}
    tweetList,temp = ([] for i in range(2))
    dataset = c2v(dataset)
    dataset["sentiment_text"] = dataset["text"].apply(findSentimentTweet)
    dataset["twokenized_text"] = dataset["text"].apply(tokenizeRawTweetText)
    dataset.dropna(inplace=True)
    dataset.sort_values(['c2v'],inplace=True)
    dataset["c2v"] = dataset["c2v"].astype(int)
    c2v_list = dataset["c2v"].unique()
    makeDocuments(c2v_list,dataset)
    desktop_path = os.path.join(os.path.join(os.path.expanduser('~')), 'Desktop')
    directory = desktop_path + "/" + "hdf5_Doc"
    for filename in os.listdir(os.getcwd()):
        if filename.endswith(".hdf5"):
            doc = pd.read_hdf(filename)
            for values in doc["text"]:
                temp.append(values)
                tweetList.append(temp)
            myDic[str(filename)] = findSentimentDoc(tweetList[-1])
        else:
            pass
    return dataset,myDic

In [38]:
def final_function():
    df = initialise_dataframe()
    df,myDict = process_dataframe(df)
    return df,myDict

In [39]:
df,myDict = final_function()

A Jupyter Widget




your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['geo', 'text', 'lang', 'twokenized_text']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [30]:
df

Unnamed: 0,geo,text,lang,timestamp_ms,c2v,sentiment_text,twokenized_text
1644,"{'type': 'Point', 'coordinates': [41.6545, -87...",Light fresh beef. But the after taste just as ...,en,2017-12-09 17:46:47.411,16,-0.113402,"[Light, fresh, beef, ., But, the, after, taste..."
10498,"{'type': 'Point', 'coordinates': [41.64943326,...",Feels good to be back at my gym since 2012; Da...,en,2017-12-11 05:33:21.327,16,0.052167,"[Feels, good, to, be, back, at, my, gym, since..."
9230,"{'type': 'Point', 'coordinates': [41.65160259,...",#BlackPanther @ AMC Theatre-Crestwood 18 https...,en,2017-12-09 19:59:40.119,16,-0.093995,"[#BlackPanther, @, AMC, Theatre-Crestwood, 18,..."
2665,"{'type': 'Point', 'coordinates': [41.64943326,...",Sunday Cardio Sessions - on the bike 🏍 listeni...,en,2017-12-11 02:32:10.718,16,-0.335063,"[Sunday, Cardio, Sessions, -, on, the, bike, 🏍..."
7452,"{'type': 'Point', 'coordinates': [41.6497, -87...",Candie sugars not too sweet ..slight vanilla.....,en,2017-12-10 23:44:59.379,21,0.356399,"[Candie, sugars, not, too, sweet, .., slight, ..."
3895,"{'type': 'Point', 'coordinates': [41.6499, -87...",Mash behaving like a good sour mash should! @ ...,en,2017-12-10 17:11:40.139,21,-0.273820,"[Mash, behaving, like, a, good, sour, mash, sh..."
1575,"{'type': 'Point', 'coordinates': [41.65462, -8...",THE AFTER PARTY IS IN ACTION AT club_krave!!! ...,en,2017-12-11 00:11:48.497,21,0.066426,"[THE, AFTER, PARTY, IS, IN, ACTION, AT, club_k..."
555,"{'type': 'Point', 'coordinates': [41.6497, -87...",Hey at a book signing 4 @bobbyskafish with a ...,en,2017-12-10 22:43:18.576,21,0.794114,"[Hey, at, a, book, signing, 4, @bobbyskafish, ..."
2271,"{'type': 'Point', 'coordinates': [41.6490841, ...",Next sat will be lit showtime 7pm don’t be lat...,en,2017-12-09 04:12:03.436,21,-0.459039,"[Next, sat, will, be, lit, showtime, 7pm, don’..."
2459,"{'type': 'Point', 'coordinates': [41.65077, -8...",I went out alone last night. I caught up on em...,en,2017-12-09 20:24:45.715,21,-0.496110,"[I, went, out, alone, last, night, ., I, caugh..."


In [31]:
myDict

{'1000.hdf5': -0.14265829994858492,
 '1001.hdf5': -0.14943281550036017,
 '1002.hdf5': -0.15434988273368022,
 '1003.hdf5': -0.1521483259531615,
 '1004.hdf5': -0.15575132732976954,
 '1005.hdf5': -0.141143204247339,
 '1006.hdf5': -0.15565735898861713,
 '1015.hdf5': -0.14756312175868425,
 '1018.hdf5': -0.14034448557248091,
 '1019.hdf5': -0.15099338250107108,
 '102.hdf5': -0.1424163049911238,
 '1021.hdf5': -0.15477059314276204,
 '1022.hdf5': -0.11941736553374938,
 '1023.hdf5': -0.15543424703208056,
 '1024.hdf5': -0.15568929864391046,
 '1026.hdf5': -0.1543505108796985,
 '1030.hdf5': -0.15567761627118118,
 '1031.hdf5': -0.14934271192283155,
 '1034.hdf5': -0.14823480617047421,
 '1035.hdf5': -0.15146185592944955,
 '1036.hdf5': -0.14912540845202663,
 '1037.hdf5': -0.1550206743415555,
 '1038.hdf5': -0.15580090907603128,
 '1039.hdf5': -0.15010310110026162,
 '1040.hdf5': -0.14282050008379937,
 '1041.hdf5': -0.15572049031016535,
 '1053.hdf5': -0.15494845060126658,
 '1054.hdf5': -0.14578762781601012,