## Parse tweet

In [1]:
import pyproj
from shapely.geometry import Polygon
from collections import defaultdict
import re
import os
import json
import pandas as pd
import geopandas as gpd
import pandasql as ps
import warnings

STATE_CODE = {1: "new south wales", 2: "victoria", 3: "queensland",
            4: "south australia", 5: "western australia", 6: "tasmania",
            7: "northern territory", 8: "australian capital territory", 
            9: "offshore territories"}
DROP_PROTO = ("new south wales, australia", "victoria, australia", "queensland, australia",
            "south australia, australia", "western australia, australia", "tasmania, australia",
            "northern territory, australia", "australian capital territory, australia", 
            "offshore territories, australia", "australia")
OUT_DIR = "../data/filt_chunk/"   ### filtered json file location
CHUNK_PATH = "../data/raw_chunk/"   ### path to store extracted chunks
TWEET_PATH = r"/mnt/f/Downloads/twitter-huge.json" ### The path to big tweet
SUA_SHAPE = "../data/SUA/SUA_2016_AUST.shp"   ### SUA shape file
CHUNK_SIZE = 1024 * 1024 * 600  # 600 MB
# CHUNK_SIZE = 1024 * 1024 * 50  # 50 MB
# START_CHUNK = 0   ## start chunk
# END_CHUNK = 1   ## End chunk
START_CHUNK = 101   ## start chunk
END_CHUNK = 120   ## End chunk
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", UserWarning)

# read sua_file
gdf = gpd.read_file(SUA_SHAPE)
sua_gdf = gdf
sua_gdf['state'] = sua_gdf.apply(lambda x: STATE_CODE[int(x["SUA_CODE16"][0])], axis = 1)
sua_gdf["geo_back"] = sua_gdf.geometry
sua_gdf = sua_gdf.set_index("SUA_CODE16")

## prepare empty dataframe


tweet_file = open(TWEET_PATH, 'rb')
for i in range(START_CHUNK):
    chunk = tweet_file.read(CHUNK_SIZE)

city_list = []

for chunk in range(START_CHUNK,END_CHUNK):
    print("\n\n__________ Iteration {} __________\nstart searching".format(chunk))
    
    ## select chunk of data
    chunk_f = tweet_file.read(CHUNK_SIZE)
    if not chunk_f:
        break
    ## Save the chunk to a new file
    input_file = CHUNK_PATH + 'chunk_' + str(chunk) + '.json'
    with open(input_file, 'wb') as chunk_file:
        chunk_file.write(chunk_f)
    print("Chunk file saved")
    
    columns = ["id", "author_id", "content", "time", "tokens", 'sentiment', "area_name", "geometry"]
    tweet_gdf = gpd.GeoDataFrame(pd.DataFrame(columns=columns))
    tweet_gdf = tweet_gdf.set_index("id")
    tweet_gdf = tweet_gdf.set_crs({'init': 'epsg:3857'})
    
    ## reading file
    tweets_files =  open(input_file,'r',encoding='utf-8')
    print("File readed")
    tweet_str= ''
    tweets_files.readline()
    # while True:
    while True:
        ## search tweet
        new_line= tweets_files.readline()
        if len(new_line)<6:
            break
        if new_line[:6]=='{"id":' and (new_line[-5:] == "]}},\n" or new_line[-4:] == "}]}}\n"):
            if new_line[-5:] == "]}},\n":
                tweet_str = new_line[:-2]
            else:
                tweet_str = new_line[:-1]

            new_line=tweets_files.readline()
            tweet_json=json.loads(tweet_str)
            
            if "includes" in tweet_json["doc"] and "places" in tweet_json["doc"]["includes"]:
                area_name = tweet_json["doc"]["includes"]["places"][0]["full_name"].lower()
                if area_name not in DROP_PROTO:
                    location = tweet_json["doc"]["includes"]["places"][0]["geo"]["bbox"]
                    geometry = Polygon([(location[2], location[3]), (location[2], location[1]), (location[0], location[1]), (location[0], location[3])])
                    data = [
                        int(tweet_json["doc"]["data"]['author_id']), 
                        tweet_json["doc"]["data"]['text'], 
                        tweet_json["doc"]["data"]["created_at"],
                        tweet_json["value"]["tokens"], tweet_json["doc"]["data"]['sentiment'], 
                        tweet_json["doc"]["includes"]["places"][0]["full_name"].lower(), 
                        geometry
                    ]
                    tweet_gdf.loc[int(tweet_json["id"])] = data
                    city_list.append(tweet_json["doc"]["includes"]["places"][0]["full_name"].lower())
        else:
            break
    print("End searching")
    if(tweet_gdf.empty):
        print("Empty dataframe")
        continue
    tweet_gdf["area_t"] = tweet_gdf.area
    tweet_gdf.index = tweet_gdf.index.astype(int)

    sql_tweet_to_sua = "SELECT id, SUA_NAME16 AS SUA_NAME, state, MAX(intersection) AS max_inter FROM match_gdf GROUP BY id"
    match_gdf = gpd.sjoin(tweet_gdf[["geometry"]], sua_gdf[["state", "geometry", "SUA_NAME16", "geo_back"]], how='inner', op='intersects')
    match_gdf["intersection"] = match_gdf.apply(lambda x: x["geometry"].intersection(x["geo_back"]).area, axis = 1)
    match_gdf = match_gdf[["index_right", "state", "SUA_NAME16", "intersection"]].reset_index()
    match_gdf = ps.sqldf(sql_tweet_to_sua)
    merge_gdf = pd.merge(tweet_gdf, match_gdf, on = "id", how = "inner").rename(columns={"index_right": "SUA"}).drop(["max_inter", "area_t", "geometry"], axis=1)
    merge_gdf = pd.DataFrame(merge_gdf)

    # output_file = OUT_DIR + "filtered_{}.csv".format(chunk)
    # merge_gdf.to_csv(output_file)
    
    output_file = OUT_DIR + "filtered_{}.json".format(chunk)
    merge_gdf.to_json(output_file, orient='records')
    print("file saved: ", output_file)
    tweets_files.close()
    os.remove(input_file)
    print("file deleted: ", output_file)





__________ Iteration 94 __________
start searching
Chunk file saved
File readed
End searching
file saved:  ../data/filt_chunk/filtered_94.json
file deleted:  ../data/filt_chunk/filtered_94.json


__________ Iteration 95 __________
start searching
Chunk file saved
File readed
End searching
file saved:  ../data/filt_chunk/filtered_95.json
file deleted:  ../data/filt_chunk/filtered_95.json


__________ Iteration 96 __________
start searching
Chunk file saved
File readed
End searching
file saved:  ../data/filt_chunk/filtered_96.json
file deleted:  ../data/filt_chunk/filtered_96.json


__________ Iteration 97 __________
start searching
Chunk file saved
File readed
End searching
file saved:  ../data/filt_chunk/filtered_97.json
file deleted:  ../data/filt_chunk/filtered_97.json


__________ Iteration 98 __________
start searching
Chunk file saved
File readed
End searching
file saved:  ../data/filt_chunk/filtered_98.json
file deleted:  ../data/filt_chunk/filtered_98.json


__________ Iterati

## Combine data

In [13]:
import os
import pandas as pd
OUT_DIR = "../data/filt_chunk/"   ### filtered json file location
LARGE_DF_DIR = "../data/filt_tweet.json"

file_names = os.listdir(OUT_DIR)

print(file_names)
large_df = pd.read_json(OUT_DIR+file_names[0])

for i in range(1, len(file_names)):
    cur_df = pd.read_json(OUT_DIR+file_names[i])
    large_df = large_df.append(cur_df)

large_df = large_df.reset_index(drop=True)
print(large_df)
large_df.to_json(LARGE_DF_DIR, orient='records')

['filtered_56.json', 'filtered_10.json', 'filtered_43.json', 'filtered_6.json', 'filtered_48.json', 'filtered_51.json', 'filtered_12.json', 'filtered_49.json', 'filtered_71.json', 'filtered_11.json', 'filtered_5.json', 'filtered_62.json', 'filtered_35.json', 'filtered_99.json', 'filtered_41.json', 'filtered_30.json', 'filtered_52.json', 'filtered_93.json', 'filtered_68.json', 'filtered_75.json', 'filtered_40.json', 'filtered_16.json', 'filtered_4.json', 'filtered_61.json', 'filtered_53.json', 'filtered_50.json', 'filtered_64.json', 'filtered_13.json', 'filtered_63.json', 'filtered_45.json', 'filtered_39.json', 'filtered_94.json', 'filtered_38.json', 'filtered_17.json', 'filtered_79.json', 'filtered_84.json', 'filtered_21.json', 'filtered_96.json', 'filtered_70.json', 'filtered_0 copy.json', 'filtered_44.json', 'filtered_1.json', 'filtered_54.json', 'filtered_0.json', 'filtered_36.json', 'filtered_82.json', 'filtered_80.json', 'filtered_31.json', 'filtered_14.json', 'filtered_58.json', 