### APANPS 5430: Applied Text and Natural Language Analytics, Fall 2020

## Assignment 2: JSON Objects

Submitted by - 
#### Harsh Dhanuka, hd2457

In [1]:
import pandas as pd
import numpy as np
import json
from pprint import pprint

from io import StringIO
from pandas_schema import Column, Schema

# Notebook-wide display limits: up to 150 rows and 20 columns per frame,
# with console output wrapped at 1000 characters.
pd.options.display.max_rows = 150
pd.options.display.max_columns = 20
pd.options.display.width = 1000

# 1. Reads JSON newsfeeds from the data file into a Python array of dictionaries or a Pandas dataframe

### I am using the `Netflix` data for my assignment

In [2]:
# Read the line-delimited JSON file (one newsfeed object per line) into a list of dictionaries

# NOTE(review): absolute, machine-specific path -- point this at your local copy of the data file before re-running.
file = '/Users/harshdhanuka/Desktop/Columbia Class Matter/SEM 3/5430 Applied Text NLP/Assignment 2/webhose_netflix.json'

# Open inside a context manager so the file handle is closed even if reading fails,
# and decode explicitly as UTF-8 instead of relying on the platform default encoding.
with open(file, encoding='utf-8') as json_file:
    json_data = json_file.readlines()

# Parse every non-blank line as its own JSON object; a blank line (e.g. a trailing
# newline at the end of the file) would otherwise make json.loads raise.
newsfeeds = [json.loads(line) for line in json_data if line.strip()]

In [3]:
# Build a DataFrame from the list of newsfeed dictionaries (one row per newsfeed)

df = pd.DataFrame(newsfeeds)

In [4]:
# Check the basic column/feature types, together with the non-null count of
# every column and the memory footprint of the frame

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25288 entries, 0 to 25287
Data columns (total 18 columns):
thread                  25288 non-null object
uuid                    25288 non-null object
url                     25288 non-null object
ord_in_thread           25288 non-null int64
parent_url              0 non-null object
author                  25288 non-null object
published               25288 non-null object
title                   25288 non-null object
text                    25288 non-null object
highlightText           25288 non-null object
highlightTitle          25288 non-null object
highlightThreadTitle    25288 non-null object
language                25288 non-null object
external_links          25288 non-null object
external_images         25288 non-null object
entities                25288 non-null object
rating                  0 non-null object
crawled                 25288 non-null object
dtypes: int64(1), object(17)
memory usage: 3.5+ MB


In [5]:
# Check the 1st row (head(1) returns the first row with all of its columns)

df.head(1)

Unnamed: 0,thread,uuid,url,ord_in_thread,parent_url,author,published,title,text,highlightText,highlightTitle,highlightThreadTitle,language,external_links,external_images,entities,rating,crawled
0,{'uuid': 'f890670c1406310226c8e9420a798d81420c...,f890670c1406310226c8e9420a798d81420c5a96,http://omgili.com/ri/.wHSUbtEfZSCvFgWhG.N__Y_k...,0,,stuff.co.nz,2020-06-03T22:49:00.000+03:00,13 Reasons Why: The popular Netflix show's cre...,The controversial 13 Reasons Why is returning ...,,,,english,[],[],"{'persons': [{'name': 'hannah baker', 'sentime...",,2020-06-03T05:55:53.024+03:00


### Convert the `published` column to appropriate format

In [6]:
# Parse the timestamp strings in `published` into pandas datetime values;
# every other column is carried over unchanged.
df = df.assign(published = pd.to_datetime(df['published']))

#### It seems that the 1st column, `thread`, is a nested dictionary whose `uuid` key holds the same value as the top-level `uuid` column.

# 2. Prints the schema of the JSON object

In [7]:
# Banner line, padded above and below by a line holding a single space
print(" ", "The schema of the JSON object is as follows:", " ", sep = "\n")

# Table Schema derived from the DataFrame's columns; as the last expression
# of the cell it is rendered as the cell output.
pd.io.json.build_table_schema(df)

 
The schema of the JSON object is as follows:
 


{'fields': [{'name': 'index', 'type': 'integer'},
  {'name': 'thread', 'type': 'string'},
  {'name': 'uuid', 'type': 'string'},
  {'name': 'url', 'type': 'string'},
  {'name': 'ord_in_thread', 'type': 'integer'},
  {'name': 'parent_url', 'type': 'string'},
  {'name': 'author', 'type': 'string'},
  {'name': 'published', 'type': 'datetime', 'tz': None},
  {'name': 'title', 'type': 'string'},
  {'name': 'text', 'type': 'string'},
  {'name': 'highlightText', 'type': 'string'},
  {'name': 'highlightTitle', 'type': 'string'},
  {'name': 'highlightThreadTitle', 'type': 'string'},
  {'name': 'language', 'type': 'string'},
  {'name': 'external_links', 'type': 'string'},
  {'name': 'external_images', 'type': 'string'},
  {'name': 'entities', 'type': 'string'},
  {'name': 'rating', 'type': 'string'},
  {'name': 'crawled', 'type': 'string'}],
 'primaryKey': ['index'],
 'pandas_version': '0.20.0'}

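#### Almost all of the variables are reported as 'string'; the exceptions are the integer fields (`index`, `ord_in_thread`) and `published`, which is a datetime. Note that nested fields such as `thread` and `entities` (dictionaries) and `external_links` (lists) are also reported as 'string', because pandas stores them with the generic `object` dtype.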

# 3. Prints the number of newsfeeds (JSON objects) in the collection

In [8]:
# df.shape is (rows, columns): report each count on its own line, with a
# single-space spacer line before, between and after the two messages.
for axis_size, axis_description in zip(
    df.shape,
    (" rows/number of newsfeeds in the JSON object file",
     " columns/number of variables in the JSON object file"),
):
    print(" ")
    print("There are ", axis_size, axis_description)
print(" ")

 
There are  25288  rows/number of newsfeeds in the JSON object file
 
There are  18  columns/number of variables in the JSON object file
 


In [9]:
# Verify the printed counts: shape is the tuple (number of rows, number of columns)

df.shape

(25288, 18)

# 4. Creates a set of unique newsfeeds by `title` and prints the new total collection size

In [10]:
# Remove duplicate entries, keeping exactly one row per unique 'title'

# keep = 'last' retains the last row of each title group in the frame's *current*
# row order, so the rows are first ordered by publication time: that way the row
# that survives really is the most recently published entry for its title.
#   - kind = 'mergesort' is a stable sort, so rows with equal timestamps keep file order
#   - na_position = 'first' stops a missing timestamp from winning over a real one
#   - sort_index() puts the surviving rows back in their original (file) order
df = (
    df.sort_values(by = 'published', kind = 'mergesort', na_position = 'first')
      .drop_duplicates(subset = 'title', keep = 'last')
      .sort_index()
)

In [11]:
# Check the first row of the new (de-duplicated) data set

df.head(1)

Unnamed: 0,thread,uuid,url,ord_in_thread,parent_url,author,published,title,text,highlightText,highlightTitle,highlightThreadTitle,language,external_links,external_images,entities,rating,crawled
0,{'uuid': 'f890670c1406310226c8e9420a798d81420c...,f890670c1406310226c8e9420a798d81420c5a96,http://omgili.com/ri/.wHSUbtEfZSCvFgWhG.N__Y_k...,0,,stuff.co.nz,2020-06-03 22:49:00+03:00,13 Reasons Why: The popular Netflix show's cre...,The controversial 13 Reasons Why is returning ...,,,,english,[],[],"{'persons': [{'name': 'hannah baker', 'sentime...",,2020-06-03T05:55:53.024+03:00


In [12]:
# df.shape is (rows, columns) of the de-duplicated frame: report each count on
# its own line, with a single-space spacer line before, between and after.
for axis_size, axis_description in zip(
    df.shape,
    (" 'unique' rows/number of newsfeeds/collections in the JSON object file",
     " columns/number of variables in the JSON object file"),
):
    print(" ")
    print("There are ", axis_size, axis_description)
print(" ")

 
There are  19514  'unique' rows/number of newsfeeds/collections in the JSON object file
 
There are  18  columns/number of variables in the JSON object file
 


In [13]:
# Verify the printed counts: shape is the tuple (number of rows, number of columns)

df.shape

(19514, 18)

# 5. Prints the latest 100 article titles, urls, and publish dates

In [14]:
# Order the newsfeeds by the 'published' column, most recent first

df = df.sort_values('published', ascending = False)

In [15]:
# Take an independent copy holding only the 3 required columns

df2 = df.loc[:, ['title', 'url', 'published']].copy()

# Show the first 100 rows of the 3-column frame, as required.
df2.head(100)

Unnamed: 0,title,url,published
0,13 Reasons Why: The popular Netflix show's cre...,http://omgili.com/ri/.wHSUbtEfZSCvFgWhG.N__Y_k...,2020-06-03 22:49:00+03:00
2,A TV reboot of Bong Joon-ho's acclaimed film S...,http://omgili.com/ri/.wHSUbtEfZSvmJKugzHb_f4zA...,2020-06-03 07:33:00+03:00
3,2-Pack: Ideaworks Mosquito Killer Lamps (batte...,http://omgili.com/ri/.wHSUbtEfZTqX_2tXJqGoKlaY...,2020-06-03 07:00:00+03:00
4,Already-Obese Average Americans Have Drunk & E...,http://omgili.com/ri/.0rSU5LtMgwxhVu4uw52IUGPG...,2020-06-03 06:45:00+03:00
5,"Netflix, Disney join other big brands in suppo...",http://omgili.com/ri/.wHSUbtEfZThiio0hiTRCuDRH...,2020-06-03 06:39:00+03:00
6,Novel Entertainment's First Animated Feature-L...,http://omgili.com/ri/.wHSUbtEfZSKGAN5cNsgLsyo0...,2020-06-03 06:06:00+03:00
7,Anime Based On Best-Selling 1973 Disaster Nove...,http://omgili.com/ri/.wHSUbtEfZQWtoCKOsZc0dhs0...,2020-06-03 05:58:00+03:00
8,Tiger King star Carole Baskin’s dead husband’s...,http://omgili.com/ri/.wHSUbtEfZQD1RVKFqxhqwVEB...,2020-06-03 05:57:00+03:00
9,All about Netflix’s sci-fi television shows we...,http://omgili.com/ri/.wHSUbtEfZRrqAH1UjXSpXrco...,2020-06-03 05:42:00+03:00
10,"File:Federation starbase, 2230s.png",http://omgili.com/ri/.wHSUbtEfZQzgU.BYEgsHJEur...,2020-06-03 05:30:00+03:00
