# Google Colab attempt - Using Geo Location

In [None]:
# Colab-only module that exposes Google Drive to this runtime.
from google.colab import drive
# Mount Drive at /content/gdrive. The input directory used by the loading cell
# (json_dir) is a path underneath this mount point, so this cell must run first.
drive.mount('/content/gdrive')

In [None]:
# Keyword lists grouped by lexical field.
# Each list holds every keyword exactly once (first-occurrence order kept), so a
# per-keyword tally cannot double-count a term. Entries that differ only by
# case ('Expensive' / 'expensive') are kept as separate keywords on purpose:
# whether matching is case-sensitive is decided by the code that consumes this dict.
lexical_fields = {
    'inflation with economic terms': [
        'inflation', 'hyperinflation', 'deflation', 'stagflation', 'price index',
        'monetary policy', 'purchasing power', 'deflationary', 'anti-inflationary',
        'anti-deflationary',
    ],

    'expensive': [
        'Expensive', 'expensive', 'prohibitive', 'costly', 'high', 'exorbitant',
        'unaffordable', 'consequential', 'inaccessible', 'excessive', 'abnormal',
        'rip-off', 'ruinous', 'outrageous', 'out of reach', 'roundabout',
        'inconceivable',
    ],

    'cheap': [
        'Low', 'modest', 'advantageous', 'discounted', 'unbeatable', 'derisory',
        'attractive', 'bargain', 'bargain price', 'affordable', 'reasonable',
        'competitive', 'accessible', 'acceptable', 'normal', 'fair', 'interesting',
        'suitable', 'negligible',
    ],

    'prices_costs': [
        'price', 'cost', 'expense', 'fee', 'charge', 'rate', 'tariff', 'sale',
        'purchase', 'lease', 'subscription', 'bill', 'pay', 'sell', 'quote',
        'payment', 'discount',
    ],

    'statistical_institutions': [
        'Bureau of Labor Statistics', 'Consumer Price Index', 'Federal Reserve',
        'ECB', 'central bank', 'Banque de France', 'INSEE', 'FED', 'rate',
        'interest rate', 'Central Bank of Ireland', 'Bank of England', 'Bank',
        'Investment Institution',
    ],

    'additional_keywords': ['economy', 'market', 'value', 'money', 'finance'],
}

In [None]:
import os
import json
import gzip  # Import gzip for handling .gz (gzipped) files

def iter_jsonl_lines(root):
    """Lazily yield every text line of every .json / .gz file found under ``root``.

    The directory tree is walked recursively. Files whose name ends in ``.gz``
    are opened through gzip in text mode; ``.json`` files are opened as plain
    text. Both are decoded as UTF-8. Files with any other extension are ignored.
    Nothing is buffered: one line is held at a time, so the caller decides what
    to keep in memory.
    """
    for path, dirs, files in os.walk(root):
        for f in files:
            if not f.endswith(('.json', '.gz')):
                continue
            # Same read loop for both formats; only the opener differs.
            opener = gzip.open if f.endswith('.gz') else open
            with opener(os.path.join(path, f), 'rt', encoding='utf-8') as jf:
                yield from jf


def collect_geocoded(lines, progress_every=100000):
    """Parse JSON lines and keep the records whose 'geo' value is not None.

    Args:
        lines: iterable of strings, one JSON document per line.
        progress_every: print a progress message every this many non-blank
            lines; pass 0 or None to stay silent.

    Returns:
        A tuple ``(geocoded, processed, malformed)`` where ``geocoded`` maps
        each kept record's 'id' to the parsed record, ``processed`` is the
        number of non-blank lines seen and ``malformed`` is how many of those
        could not be parsed as JSON (they are skipped, not fatal).
    """
    geocoded = {}
    processed = 0
    malformed = 0
    for json_str in lines:
        if not json_str.strip():
            # Blank lines are not records; json.loads would raise on them.
            continue
        processed += 1
        try:
            result = json.loads(json_str)
        except json.JSONDecodeError:
            # One damaged line should not throw away the rest of the run.
            malformed += 1
        else:
            # Only dict records can carry a 'geo' entry; keep geocoded ones.
            if isinstance(result, dict) and result.get('geo') is not None:
                geocoded[result['id']] = result
        if progress_every and processed % progress_every == 0:
            print('Processed', processed, 'records...')
    return geocoded, processed, malformed


# json_dir is the directory that contains a day's worth of files.
# It is an absolute path inside the mounted Google Drive, so Drive must be
# mounted at /content/gdrive before this cell runs.
json_dir = '/content/gdrive/MyDrive/November 21st 2022'

outfolder = 'output'
os.makedirs(outfolder, exist_ok=True)

if not os.path.isdir(json_dir):
    # os.walk yields nothing for a missing directory, which would otherwise
    # produce an empty output file without any hint of what went wrong.
    print('WARNING: input directory not found:', json_dir)

print('Reading JSONL records and keeping the geocoded ones...')
# Lines are parsed as they are read instead of first being collected in a list,
# so the raw text of the whole day is never held in memory alongside the dicts.
geo_dict, i, skipped = collect_geocoded(iter_jsonl_lines(json_dir))

print('Finished processing', i, 'records.')
if skipped:
    print('Skipped', skipped, 'malformed lines.')
print('Created dictionary with', len(geo_dict), 'geocoded records...')

# Define the output file name
outfile = 'all_records_with_geo_v21.json'

# Construct the output path by joining the output folder and the output file name
outpath = os.path.join(outfolder, outfile)

print('Writing output for all geo records...')
with open(outpath, 'w', encoding='utf-8') as outf:
    json.dump(geo_dict, outf)

print('Wrote output file - Done!')

Finished reading 4190001 records into list
Converting geocoded JSONL records to dictionary now...
Processed 100000 records...
Processed 200000 records...
Processed 300000 records...
Processed 400000 records...
Processed 500000 records...
Processed 600000 records...
Processed 700000 records...
Processed 800000 records...
Processed 900000 records...
Processed 1000000 records...
Processed 1100000 records...
Processed 1200000 records...
Processed 1300000 records...
Processed 1400000 records...
Processed 1500000 records...
Processed 1600000 records...
Processed 1700000 records...
Processed 1800000 records...
Processed 1900000 records...
Processed 2000000 records...
Processed 2100000 records...
Processed 2200000 records...
Processed 2300000 records...
Processed 2400000 records...
Processed 2500000 records...
Processed 2600000 records...
Processed 2700000 records...
Processed 2800000 records...
Processed 2900000 records...
Processed 3000000 records...
Processed 3100000 records...
Processed 32

In [None]:
import json, csv, os

def tweet_to_row(tweet_id, tweet):
    """Flatten one parsed tweet record into the list written as a CSV row.

    Args:
        tweet_id: key of the record in the loaded JSON mapping.
        tweet: the record itself (a dict).

    Returns:
        ``[tweet_id, created_at, text, lang, longitude, latitude, country,
        country_code]``. Any value whose source key is missing or null is None.
    """
    geo = tweet.get('geo') or {}
    place = tweet.get('place') or {}

    coords = geo.get('coordinates')
    if coords and len(coords) >= 2:
        # Index 0 is read as latitude and index 1 as longitude.
        # NOTE(review): this matches the [lat, lon] order of the legacy Twitter
        # v1.1 `geo` field - confirm against the data source.
        latitude, longitude = coords[0], coords[1]
    else:
        latitude = longitude = None

    # NOTE(review): if these are v1.1 streaming payloads, 'text' may be the
    # truncated form of long tweets - confirm whether full text is needed.
    return [
        tweet_id,
        tweet.get('created_at'),
        tweet.get('text'),
        tweet.get('lang'),
        longitude,
        latitude,
        place.get('country'),
        place.get('country_code'),
    ]


# Folder shared by the input JSON and the CSV written below.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

# Read the geocoded-records file from the same relative folder the loading cell
# writes it to, so both cells agree regardless of the working directory
# (under Colab's default working directory this is /content/output/...).
infile = os.path.join(output_dir, 'all_records_with_geo_v21.json')

with open(infile, 'r', encoding='utf-8') as json_file:
    twit_data = json.load(json_file)

# One row per record. Other tweet fields (source, place names, user metadata)
# are currently not exported.
twit_list = [tweet_to_row(k, v) for k, v in twit_data.items()]

outfile = 'november_21st.csv'
outpath = os.path.join(output_dir, outfile)

# Write the output CSV file; every field is quoted because tweet text can
# contain commas and line breaks.
with open(outpath, 'w', newline='', encoding='utf-8') as writefile:
    writer = csv.writer(writefile, quoting=csv.QUOTE_ALL, delimiter=',')
    header = ['tweet_id', 'timestamp', 'tweet', 'lang', 'longitude', 'latitude', 'country','ccode']
    writer.writerow(header)
    writer.writerows(twit_list)

print('Done!')

Done!
