# Pulitzer Insights

Q1. Which newspaper is getting the most Pulitzer Prizes?
Q2. What are the top 5 topics?

Reading data file from GCP.

In [2]:
import pandas as pd
from google.cloud import storage

import datetime as dt
from datetime import datetime
from pytz import timezone

import uuid

# Fetch pulitzer.csv from the Google Cloud Storage bucket into the working
# directory, then load it into a DataFrame.
client = storage.Client()
bucket = client.get_bucket('capstone_project_sr')
blob = storage.Blob('pulitzer.csv', bucket)
with open('pulitzer.csv', 'wb') as file_obj:
    blob.download_to_file(file_obj)

# header=0 marks the first row as the header; the names below replace it.
df = pd.read_csv(
    'pulitzer.csv',
    sep=',',
    header=0,
    names=[
        'Newspaper',
        'DailyCirculation_2004',
        'DailyCirculation_2013',
        'ChangeInDailyCirculation_2004_2013',
        'WinnersAndFinalists_1990_2003',
        'WinnersAndFinalists_2004_2014',
        'WinnersAndFinalists_1990_2014',
    ],
)

Adding basic data audit fields just in case we run into conflicts later.

In [3]:
# Audit columns: a load timestamp and a per-row surrogate key.
tz = timezone('EST')
# NOTE(review): the tz-aware "now" below is computed and discarded; Entrydate
# is stamped with the naive dt.datetime.now(). Confirm whether EST was intended.
datetime.now(tz)
df['Entrydate'] = dt.datetime.now()

# One fresh UUID4 per row, placed as the first column.
df.insert(0, 'Id', [uuid.uuid4() for _ in range(len(df))])

In [4]:
df.info() # print column names, non-null counts and dtypes of the Pulitzer frame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 9 columns):
Id                                    50 non-null object
Newspaper                             50 non-null object
DailyCirculation_2004                 50 non-null object
DailyCirculation_2013                 50 non-null object
ChangeInDailyCirculation_2004_2013    50 non-null object
WinnersAndFinalists_1990_2003         50 non-null int64
WinnersAndFinalists_2004_2014         50 non-null int64
WinnersAndFinalists_1990_2014         50 non-null int64
Entrydate                             50 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(3), object(5)
memory usage: 3.6+ KB


### Inserting data into Cassandra database for persistence and reliability among other benefits.

Reading the connection points details.

In [8]:
# Connection settings for Cassandra; _insertData reads the `ip`, `user` and
# `token` columns of the first row.
df_con=pd.read_csv('connection_point.csv',header=0)

In [22]:
import itertools
from multiprocessing import Pool
import sys
import time
from cassandra.cluster import Cluster
from cassandra.concurrent import execute_concurrent_with_args
from cassandra.query import tuple_factory
from cassandra.auth import PlainTextAuthProvider

def _insertData(params):
    """Insert a batch of rows into ``capstone.pulitzer``.

    Opens a new Cassandra connection using the first row of the notebook-level
    ``df_con`` frame (``ip``, ``user``, ``token``), prepares the INSERT and
    runs it once per parameter tuple with ``concurrency=50``.

    Args:
        params: sequence of 9-item tuples in the column order of the INSERT.

    Returns:
        None.
    """
    insert_cql = "INSERT INTO capstone.pulitzer \
                             (id,Newspaper,DailyCirculation_2004,DailyCirculation_2013, \
                             ChangeDailyCirculation_2004_2013,WinNFinalists_1990_2003, \
                             WinNFinalists_2004_2014,WinNFinalists_1990_2014,Entrydate) \
                             VALUES (?,?,?,?,?,?,?,?,?)"
    cluster = Cluster(contact_points=[df_con.ip[0]],
                      auth_provider=PlainTextAuthProvider(username=df_con.user[0],
                                                          password=df_con.token[0]))
    try:
        session = cluster.connect()
        session.set_keyspace('capstone')
        session.row_factory = tuple_factory
        prepared = session.prepare(insert_cql)

        # The DataStax driver pipelines the inserts for this batch.
        execute_concurrent_with_args(session, prepared, params, concurrency=50)
    finally:
        # Every call opens its own cluster connection; close it so repeated
        # batches do not accumulate open connections.
        cluster.shutdown()
    return None

def multiprocess(params):
    """Insert ``params`` in batches of 100 using a pool of 4 worker processes.

    Args:
        params: list of bind-parameter tuples accepted by ``_insertData``.

    Returns:
        A list with one single-element list per batch, each holding the value
        ``_insertData`` returned for that batch. Empty when ``params`` is empty.
    """
    batches = [params[n:n+100] for n in range(0, len(params), 100)]
    pool = Pool(processes=4)
    try:
        # One map over all batches lets the workers run side by side; mapping
        # a single batch per call would wait for each batch in turn.
        outcomes = pool.map(_insertData, batches)
    finally:
        # Always release the worker processes, even when an insert fails.
        pool.close()
        pool.join()
    # Preserve the historical return shape (a list of one-item lists).
    return [[outcome] for outcome in outcomes]
    

if __name__ == "__main__":
    # Build one bind-parameter tuple per DataFrame row, then hand the full
    # list to multiprocess for insertion.
    parameters=[]
    for index, row in enumerate(df.values):        
        # NOTE: a..i (plus index/row/row1) are notebook-level names and stay
        # defined after this cell has run.
        (a,b,c,d,e,f,g,h,i) = row
        # The first (a) and last (i) values pass through unchanged; the seven
        # in between are converted to str.
        row1=(a,str(b),str(c),str(d),str(e),str(f),str(g),str(h),i)
        parameters.append(row1)           
    # `a` is rebound here to whatever multiprocess returns.
    a = multiprocess(parameters)

### Reading GDP by state and country

In [23]:
import os

In [28]:
# Load every GDP extract whose name starts with "usgs_1957_2015" and stack
# them into one frame with columns year / GDP-billion / state.
gdp_dir = './Datascience/DataScience/CapstoneProjects/Data'
files = [file for file in os.listdir(gdp_dir)
         if file.startswith("usgs_1957_2015")]

gdp_frames = []
for file_ in files:
    filename = gdp_dir + '/' + file_
    # A dedicated name is used so the Pulitzer frame `df` is not overwritten.
    gdp_state = pd.read_csv(filename, skiprows=0, header=1, skipfooter=6,
                            error_bad_lines=False, engine='python',
                            usecols=range(0, 3))
    # The third column's header holds the state after a '-'; its first two
    # characters become the value of the new column.
    gdp_state['State'] = gdp_state.columns[2].split('-')[1][:2]
    del gdp_state[gdp_state.columns[2]]
    gdp_state.columns = ['year', 'GDP-billion', 'state']
    gdp_frames.append(gdp_state)

# Concatenate once rather than growing the frame inside the loop (which copies
# all accumulated rows on every iteration). No matching files still yields an
# empty frame.
if gdp_frames:
    gdp_merged = pd.concat(gdp_frames, ignore_index=True, axis=0)
else:
    gdp_merged = pd.DataFrame()

Skipping line 67: ',' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 68: ',' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 69: ',' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 72: ',' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 73: ',' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library ha

Skipping line 67: ',' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 68: ',' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 69: ',' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 72: ',' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library has parsed all rows).
Skipping line 73: ',' expected after '"'. Error could possibly be due to parsing errors in the skipped footer rows (the skipfooter keyword is only applied after Python's csv library ha

In [32]:
# Audit columns: a load timestamp and a per-row surrogate key.
tz = timezone('EST')
# NOTE(review): the tz-aware "now" below is computed and discarded; Entrydate
# is stamped with the naive dt.datetime.now(). Confirm whether EST was intended.
datetime.now(tz)
gdp_merged['Entrydate'] = dt.datetime.now()

# One fresh UUID4 per row, placed as the first column.
gdp_merged.insert(0, 'Id', [uuid.uuid4() for _ in range(len(gdp_merged))])

In [34]:
def _insertData(params):
    """Insert a batch of rows into ``capstone.GDP``.

    Opens a new Cassandra connection using the first row of the notebook-level
    ``df_con`` frame (``ip``, ``user``, ``token``), prepares the INSERT and
    runs it once per parameter tuple with ``concurrency=50``.

    Args:
        params: sequence of 5-item tuples ordered
            (id, year, GDP, state, entrydate).

    Returns:
        None.
    """
    cluster = Cluster(contact_points=[df_con.ip[0]],
                      auth_provider=PlainTextAuthProvider(username=df_con.user[0],
                                                          password=df_con.token[0]))
    try:
        session = cluster.connect()
        session.set_keyspace('capstone')
        session.row_factory = tuple_factory
        prepared = session.prepare("INSERT INTO capstone.GDP(id,year,GDP,state,entrydate) VALUES (?,?,?,?,?)")

        # The DataStax driver pipelines the inserts for this batch.
        execute_concurrent_with_args(session, prepared, params, concurrency=50)
    finally:
        # Every call opens its own cluster connection; close it so repeated
        # batches do not accumulate open connections.
        cluster.shutdown()
    return None

def multiprocess(params):
    """Insert ``params`` in batches of 100 using a pool of 4 worker processes.

    Args:
        params: list of bind-parameter tuples accepted by ``_insertData``.

    Returns:
        A list with one single-element list per batch, each holding the value
        ``_insertData`` returned for that batch. Empty when ``params`` is empty.
    """
    batches = [params[n:n+100] for n in range(0, len(params), 100)]
    pool = Pool(processes=4)
    try:
        # One map over all batches lets the workers run side by side; mapping
        # a single batch per call would wait for each batch in turn.
        outcomes = pool.map(_insertData, batches)
    finally:
        # Always release the worker processes, even when an insert fails.
        pool.close()
        pool.join()
    # Preserve the historical return shape (a list of one-item lists).
    return [[outcome] for outcome in outcomes]
    

if __name__ == "__main__":
    # Build one bind-parameter tuple per GDP row, ordered
    # (id, year, GDP, state, entrydate) to match the INSERT in _insertData.
    parameters=[]
    for row in gdp_merged.values:
        (a,b,c,d,e) = row
        # Bind this row's own Entrydate (e). The previous code bound `i`, a
        # name left over from an earlier cell, so every row carried a stale
        # timestamp (or raised NameError on a fresh kernel).
        # NOTE(review): int(c) drops any fractional part of the GDP value -
        # confirm this matches the table's column type.
        row1=(a,str(b),int(c),str(d),e)
        parameters.append(row1)
    # `a` is rebound here to whatever multiprocess returns.
    a = multiprocess(parameters)

In [44]:
# Download the crime-index workbook from the bucket. The bucket object is
# named 'CrimeIndex.xlsx'; the local copy is written as 'Crimeindex.xlsx'.
blob = storage.Blob('CrimeIndex.xlsx', bucket)
with open('Crimeindex.xlsx', 'wb') as file_obj:
    blob.download_to_file(file_obj)

# Parse Sheet1, taking the column headers from its first row.
xl = pd.ExcelFile('Crimeindex.xlsx')
df_crime = xl.parse('Sheet1', header=0)

In [46]:
# Audit columns: a load timestamp and a per-row surrogate key.
tz = timezone('EST')
# NOTE(review): the tz-aware "now" below is computed and discarded; Entrydate
# is stamped with the naive dt.datetime.now(). Confirm whether EST was intended.
datetime.now(tz)
df_crime['Entrydate'] = dt.datetime.now()

# One fresh UUID4 per row, placed as the first column.
df_crime.insert(0, 'Id', [uuid.uuid4() for _ in range(len(df_crime))])

In [57]:
# Rename all five columns positionally; 'state-population' is split into
# separate 'state' and 'population' columns in the following cells.
df_crime.columns=['Id','rank','crimeindex','state-population','Entrydate']

In [83]:
# Take the text before the '/' in 'state-population' and drop its last
# character (presumably a trailing space - confirm against the sheet).
df_crime.insert(
    4,
    'state',
    df_crime['state-population'].apply(lambda cell: cell.split('/')[0][:-1]),
)

In [85]:
# Take the text after the '/' in 'state-population', trimmed of surrounding
# whitespace; the value stays a string at this point.
df_crime.insert(
    5,
    'population',
    df_crime['state-population'].apply(lambda cell: cell.split('/')[1].strip()),
)

In [88]:
# Column 3 is the combined 'state-population' field, now redundant after the
# 'state' and 'population' columns were added at positions 4 and 5.
del df_crime[df_crime.columns[3]]

In [89]:
df_crime  # display the cleaned crime-index frame

Unnamed: 0,Id,rank,crimeindex,state,population,Entrydate
0,7fa8d8bc-6df4-474b-897c-bad1880e3b2e,1,950,Vermont,626358,2017-10-22 23:05:01.480357
1,e9599ed6-8805-4f31-a07b-2149b6690b4a,2,956,New Hampshire,1321069,2017-10-22 23:05:01.480357
2,669e8915-a451-49f3-8b0a-9b8202256ae5,3,985,North Dakota,704925,2017-10-22 23:05:01.480357
3,d2919cd0-337c-4144-bdc7-deef18b5789b,4,1017,Maine,1328535,2017-10-22 23:05:01.480357
4,b1909949-f893-4257-85f1-4d284245d826,5,1042,South Dakota,834708,2017-10-22 23:05:01.480357
5,ebd33c0e-8506-49df-a5a3-d62e9c9061bf,6,1068,Idaho,1599464,2017-10-22 23:05:01.480357
6,01788308-b9e4-4be9-83ca-7005561d114b,7,1125,Virginia,8185131,2017-10-22 23:05:01.480357
7,a98805b2-bc79-48fe-985b-0138cf4c9a36,8,1136,Wyoming,575251,2017-10-22 23:05:01.480357
8,b774e183-9c16-45a9-80fb-78e00c59c17a,9,1171,Connecticut,3592053,2017-10-22 23:05:01.480357
9,f0f7cc57-abfc-410d-8df6-5ec48b42ebdf,10,1180,New Jersey,8874374,2017-10-22 23:05:01.480357


In [100]:
def _insertData(params):
    """Insert a batch of rows into ``capstone.crimeindex``.

    Opens a new Cassandra connection using the first row of the notebook-level
    ``df_con`` frame (``ip``, ``user``, ``token``), prepares the INSERT and
    runs it once per parameter tuple with ``concurrency=50``.

    Args:
        params: sequence of 6-item tuples ordered
            (id, rank, crimeindex, state, population, entrydate).

    Returns:
        None.
    """
    insert_cql = "INSERT INTO capstone.crimeindex(id,rank,crimeindex,state,population,entrydate)\
                              VALUES (?,?,?,?,?,?)"
    cluster = Cluster(contact_points=[df_con.ip[0]],
                      auth_provider=PlainTextAuthProvider(username=df_con.user[0],
                                                          password=df_con.token[0]))
    try:
        session = cluster.connect()
        session.set_keyspace('capstone')
        session.row_factory = tuple_factory
        prepared = session.prepare(insert_cql)

        # The DataStax driver pipelines the inserts for this batch.
        execute_concurrent_with_args(session, prepared, params, concurrency=50)
    finally:
        # Every call opens its own cluster connection; close it so repeated
        # batches do not accumulate open connections.
        cluster.shutdown()
    return None

def multiprocess(params):
    """Insert ``params`` in batches of 100 using a pool of 4 worker processes.

    Args:
        params: list of bind-parameter tuples accepted by ``_insertData``.

    Returns:
        A list with one single-element list per batch, each holding the value
        ``_insertData`` returned for that batch. Empty when ``params`` is empty.
    """
    batches = [params[n:n+100] for n in range(0, len(params), 100)]
    pool = Pool(processes=4)
    try:
        # One map over all batches lets the workers run side by side; mapping
        # a single batch per call would wait for each batch in turn.
        outcomes = pool.map(_insertData, batches)
    finally:
        # Always release the worker processes, even when an insert fails.
        pool.close()
        pool.join()
    # Preserve the historical return shape (a list of one-item lists).
    return [[outcome] for outcome in outcomes]
    

if __name__ == "__main__":
    # Shape each crime-index row into the bind order of the INSERT:
    # (id, rank, crimeindex, state, population, entrydate).
    parameters = []
    for record in df_crime.values:
        row_id, rank, crime_index, state, population, entry_date = record
        parameters.append((
            row_id,
            int(rank),
            int(crime_index),
            str(state),
            int(population.replace(',', '')),  # remove commas before converting
            entry_date,
        ))
    # `a` holds whatever multiprocess returns.
    a = multiprocess(parameters)

In [103]:
# Download the crime-by-volume workbook from the bucket under the same name.
filename = 'table_1_crime_in_the_united_states_by_volume_and_rate_per_100000_inhabitants_1995-2014.xls'
blob = storage.Blob(filename, bucket)
with open(filename, 'wb') as file_obj:
    blob.download_to_file(file_obj)

In [104]:
# Parse sheet '14tbl01': skip the 3 leading rows, drop the 10 trailing rows
# and keep only the first four columns.
filename = 'table_1_crime_in_the_united_states_by_volume_and_rate_per_100000_inhabitants_1995-2014.xls'
xl = pd.ExcelFile(filename)
df_crime_byvol = xl.parse(
    '14tbl01',
    skiprows=3,
    header=0,
    skipfooter=10,
    error_bad_lines=False,
    engine='python',
    usecols=range(0, 4),
)

In [106]:
# Keep only the first four characters of each Year label and store it as int
# (presumably some labels carry a trailing footnote marker - confirm).
df_crime_byvol['Year'] = df_crime_byvol['Year'].apply(
    lambda year_label: int(str(year_label)[:4])
)

In [108]:
# Rename the four parsed columns positionally to lowercase identifiers.
df_crime_byvol.columns=['year','population','violentcrime','violentcrimerate']

In [110]:
# Audit columns: a load timestamp and a per-row surrogate key.
tz = timezone('EST')
# NOTE(review): Entrydate is stamped with the naive dt.datetime.now(); `tz` is
# not applied to it. Confirm whether EST was intended.
df_crime_byvol['Entrydate'] = dt.datetime.now()

# One fresh UUID4 per row of THIS frame. The previous code derived the ids
# from df_crime.Id, so they were index-aligned from a different table and any
# row without a matching index in df_crime would have received a missing id.
df_crime_byvol.insert(0, 'Id', [uuid.uuid4() for _ in range(len(df_crime_byvol))])

In [111]:
df_crime_byvol  # display the cleaned crime-by-volume frame

Unnamed: 0,Id,year,population,violentcrime,violentcrimerate,Entrydate
0,921006ea-e4ad-412f-b0c8-9a494feb08fb,1995,262803276,1798792,684.5,2017-10-23 00:27:37.706506
1,3cda340b-794f-49da-9b73-82fdd189c67d,1996,265228572,1688540,636.6,2017-10-23 00:27:37.706506
2,8d58a704-0aa5-46a8-9868-e581592bffc1,1997,267783607,1636096,611.0,2017-10-23 00:27:37.706506
3,bfac0bd3-8fc6-4cf0-b5bd-16a9a35117fa,1998,270248003,1533887,567.6,2017-10-23 00:27:37.706506
4,80be7628-bd41-44c6-87fb-310667d546cc,1999,272690813,1426044,523.0,2017-10-23 00:27:37.706506
5,69f9c891-aa66-40e3-81d7-b2cd728418be,2000,281421906,1425486,506.5,2017-10-23 00:27:37.706506
6,e1fd6370-291f-44e4-b742-5b34689a5d87,2001,285317559,1439480,504.5,2017-10-23 00:27:37.706506
7,c5030888-1574-49e9-baea-48410b12a52d,2002,287973924,1423677,494.4,2017-10-23 00:27:37.706506
8,edce9436-225c-4d11-99b9-ca8d85bbfcfc,2003,290788976,1383676,475.8,2017-10-23 00:27:37.706506
9,1b4b77c0-7642-4125-869a-d2e6b5ba52a9,2004,293656842,1360088,463.2,2017-10-23 00:27:37.706506


In [112]:
def _insertData(params):
    """Insert a batch of rows into ``capstone.crimebyvol``.

    Opens a new Cassandra connection using the first row of the notebook-level
    ``df_con`` frame (``ip``, ``user``, ``token``), prepares the INSERT and
    runs it once per parameter tuple with ``concurrency=50``.

    Args:
        params: sequence of 6-item tuples ordered (id, year, population,
            violentcrime, violentcrimerate, entrydate).

    Returns:
        None.
    """
    insert_cql = "INSERT INTO capstone.crimebyvol(id,year,population, \
                              violentcrime,violentcrimerate,entrydate) \
                              VALUES (?,?,?,?,?,?)"
    cluster = Cluster(contact_points=[df_con.ip[0]],
                      auth_provider=PlainTextAuthProvider(username=df_con.user[0],
                                                          password=df_con.token[0]))
    try:
        session = cluster.connect()
        session.set_keyspace('capstone')
        session.row_factory = tuple_factory
        prepared = session.prepare(insert_cql)

        # The DataStax driver pipelines the inserts for this batch.
        execute_concurrent_with_args(session, prepared, params, concurrency=50)
    finally:
        # Every call opens its own cluster connection; close it so repeated
        # batches do not accumulate open connections.
        cluster.shutdown()
    return None

def multiprocess(params):
    """Insert ``params`` in batches of 100 using a pool of 4 worker processes.

    Args:
        params: list of bind-parameter tuples accepted by ``_insertData``.

    Returns:
        A list with one single-element list per batch, each holding the value
        ``_insertData`` returned for that batch. Empty when ``params`` is empty.
    """
    batches = [params[n:n+100] for n in range(0, len(params), 100)]
    pool = Pool(processes=4)
    try:
        # One map over all batches lets the workers run side by side; mapping
        # a single batch per call would wait for each batch in turn.
        outcomes = pool.map(_insertData, batches)
    finally:
        # Always release the worker processes, even when an insert fails.
        pool.close()
        pool.join()
    # Preserve the historical return shape (a list of one-item lists).
    return [[outcome] for outcome in outcomes]
    
if __name__ == "__main__":
    # Shape each row into the bind order of the INSERT:
    # (id, year, population, violentcrime, violentcrimerate, entrydate).
    parameters = []
    for record in df_crime_byvol.values:
        row_id, year, population, violent_crime, violent_rate, entry_date = record
        parameters.append((
            row_id,
            str(year),
            int(population),
            int(violent_crime),
            # NOTE(review): int() drops the fractional part of the rate -
            # confirm this matches the table's column type.
            int(violent_rate),
            entry_date,
        ))
    # `a` holds whatever multiprocess returns.
    a = multiprocess(parameters)

In [172]:
# Read the 2000-2009 state population file, keeping column positions
# 2-7 and 9-18 only.
filename = './Datascience/DataScience/CapstoneProjects/Data/st-est00int-alldata.csv'
df_ = pd.read_csv(
    filename,
    header=0,
    usecols=[2, 3, 4, 5, 6, 7] + list(range(9, 19)),
)

In [173]:
# Keep rows where SEX, ORIGIN, RACE and AGEGRP are all 0 (presumably the
# all-categories totals - confirm against the file's layout description).
df_pop = df_[
    (df_['SEX'] == 0)
    & (df_['ORIGIN'] == 0)
    & (df_['RACE'] == 0)
    & (df_['AGEGRP'] == 0)
]

In [175]:
# The four filter columns are constant after filtering; drop them.
# Note that this rebinds `df_` to the filtered, narrowed frame.
df_ = df_pop.drop(
    ['SEX', 'ORIGIN', 'RACE', 'AGEGRP'],
    axis=1,
)

In [177]:
df_.columns  # list the columns that remain after dropping the filter fields

Index(['STATE', 'NAME', 'POPESTIMATE2000', 'POPESTIMATE2001',
       'POPESTIMATE2002', 'POPESTIMATE2003', 'POPESTIMATE2004',
       'POPESTIMATE2005', 'POPESTIMATE2006', 'POPESTIMATE2007',
       'POPESTIMATE2008', 'POPESTIMATE2009'],
      dtype='object')

In [178]:
# Read the 2010-2016 state population file, keeping column positions
# 1, 3, 4, 7 and 9-13.
# NOTE(review): position 8 is skipped, so the columns jump from
# POPESTIMATE2010 to POPESTIMATE2012 - confirm 2011 was meant to be excluded.
filename = './Datascience/DataScience/CapstoneProjects/Data/nst-est2016-alldata.csv'
df_1 = pd.read_csv(
    filename,
    header=0,
    usecols=[1, 3, 4, 7] + list(range(9, 14)),
)

In [179]:
df_1.columns  # list the columns selected from the 2010-2016 file

Index(['REGION', 'STATE', 'NAME', 'POPESTIMATE2010', 'POPESTIMATE2012',
       'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015',
       'POPESTIMATE2016'],
      dtype='object')

In [194]:
# Left-join the 2000-2009 estimates onto the 2010-2016 frame by NAME. Every
# row of df_1 is kept; STATE exists in both inputs, so it comes out as
# STATE_x / STATE_y.
df_pop_merged = df_1.merge(df_, how='left', on='NAME')

In [198]:
df_pop_merged  # display the merged population frame

Unnamed: 0,REGION,STATE_x,NAME,POPESTIMATE2010,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,STATE_y,POPESTIMATE2000,POPESTIMATE2001,POPESTIMATE2002,POPESTIMATE2003,POPESTIMATE2004,POPESTIMATE2005,POPESTIMATE2006,POPESTIMATE2007,POPESTIMATE2008,POPESTIMATE2009
0,0,0,United States,309348193,313998379,316204908,318563456,320896618,323127513,0.0,282162411.0,284968955.0,287625193.0,290107933.0,292805298.0,295516599.0,298379912.0,301231207.0,304093966.0,306771529.0
1,1,0,Northeast Region,55388056,55829059,55988771,56116791,56184737,56209510,,,,,,,,,,,
2,2,0,Midwest Region,66978602,67332320,67543948,67726368,67838387,67941429,,,,,,,,,,,
3,3,0,South Region,114863114,117299171,118424320,119696311,121039206,122319574,,,,,,,,,,,
4,4,0,West Region,72118421,73537829,74247869,75023986,75834288,76657000,,,,,,,,,,,
5,3,1,Alabama,4785492,4815960,4829479,4843214,4853875,4863300,1.0,4452173.0,4467634.0,4480089.0,4503491.0,4530729.0,4569805.0,4628981.0,4672840.0,4718206.0,4757938.0
6,4,2,Alaska,714031,731089,736879,736705,737709,741894,2.0,627963.0,633714.0,642337.0,648414.0,659286.0,666946.0,675302.0,680300.0,687455.0,698895.0
7,4,4,Arizona,6408312,6549634,6624617,6719993,6817565,6931071,4.0,5160586.0,5273477.0,5396255.0,5510364.0,5652404.0,5839077.0,6029141.0,6167681.0,6280362.0,6343154.0
8,3,5,Arkansas,2921995,2950685,2958663,2966912,2977853,2988248,5.0,2678588.0,2691571.0,2705927.0,2724816.0,2749686.0,2781097.0,2821761.0,2848650.0,2874554.0,2896843.0
9,4,6,California,37332685,38011074,38335203,38680810,38993940,39250017,6.0,33987977.0,34479458.0,34871843.0,35253159.0,35574576.0,35827943.0,36021202.0,36250311.0,36604337.0,36961229.0
