## Clustering of the Neighborhoods in Houston City, Texas

### Section 1: Data collection and data loading into notebook

#### a) Population data of Houston city, TX (1850 - 2010)

In [197]:
import pandas as pd
# Load the Houston year/population table from a CSV file in the working directory.
# The CSV's unlabeled first column is what shows up as "Unnamed: 0" in the preview.
df_ph = pd.read_csv('population_houston_city.csv')
# Display the first rows as a quick check that the file was parsed as expected.
df_ph.head()

Unnamed: 0,year,population
0,1850,2396
1,1860,4845
2,1870,9382
3,1880,16513
4,1890,27557


#### b) List of Houston neighborhoods with latitude and longitude

In [198]:
# Load the neighborhood lookup table (name, numeric index, latitude, longitude)
# from a CSV file in the working directory.
df_n = pd.read_csv('Houston_Neighborhoods.csv')
# Display the first rows; the 'HoustonNeighborhoods' values are reused further
# down as the stems of the per-neighborhood PDF file names.
df_n.head()

Unnamed: 0,HoustonNeighborhoods,Index,Latitude,Longitude
0,9_Addicks_Park_Ten,9,29.8133,-95.6455
1,23_AftonOaks_RiverOaks,23,29.749994,-95.433234
2,25_Alief,25,29.6827,-95.5932
3,34_Astrodome_Area,34,29.685045,-95.409813
4,30_Braeburn,30,29.682779,-95.53498


#### c) Loading of 73 uploaded PDF files into notebook

##### c.1 Convert PDF into text using pdfminer.six

In [199]:
 #!conda install -c conda-forge pdfminer --yes
!conda install -c conda-forge pdfminer.six --yes

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [200]:
import io

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage


def convert_pdf_to_txt(path):
    """Extract the text of a PDF file and return it as one string.

    Every page yielded by ``PDFPage.get_pages`` is fed through a pdfminer
    ``TextConverter`` that writes into an in-memory buffer.

    Parameters
    ----------
    path : str or path-like
        Location of the PDF file; it is opened in binary mode.

    Returns
    -------
    str
        The text accumulated by the converter for all processed pages.

    Notes
    -----
    The file handle, the text buffer and the converter are released even when
    pdfminer raises while parsing (e.g. on a damaged file), so a failing PDF in
    a batch does not leak an open file.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    with io.StringIO() as retstr, open(path, 'rb') as fp:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # An empty page-number set together with maxpages=0 places no
            # restriction on which pages are read (see pdfminer's get_pages).
            pages = PDFPage.get_pages(fp, set(),
                                      maxpages=0,
                                      password="",
                                      caching=True,
                                      check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer before the `with` block closes it.
        return retstr.getvalue()

In [201]:
import numpy as np
# Derive the PDF file name of every neighborhood: "<neighborhood name>_v2.pdf",
# in the same order as the rows of df_n.
neighbor = df_n['HoustonNeighborhoods'].to_numpy()
filename = [str(name) + '_v2.pdf' for name in neighbor]

### Section 2: Data preparation and generation of dataframe

#### a) parse feature data from text

In [202]:
# Helpers and entry point for pulling the numeric features out of the text of
# one neighborhood PDF.

def _to_number(token):
    """Convert a numeric token such as '19,683' to float; a blank token gives NaN."""
    cleaned = token.replace(',', '')  # drop every thousands separator
    if cleaned.strip() == '':
        return float('nan')
    return float(cleaned)


def _second_dollar_amount(text, label):
    """Return the amount after the second '$' that follows ``label`` (NaN if absent).

    The amount is the run of characters between that '$' and the next space.
    """
    label_at = text.find(label)
    if label_at == -1:
        return float('nan')
    first_dollar = text.find('$', label_at)
    if first_dollar == -1:
        return float('nan')
    second_dollar = text.find('$', first_dollar + 1)
    if second_dollar == -1:
        return float('nan')
    end = text.find(' ', second_dollar)
    if end == -1:
        end = len(text)
    return _to_number(text[second_dollar + 1:end])


def _second_value_after(text, start):
    """Return the text between the 2nd and 3rd space found at or after ``start``.

    Only the space character is treated as a delimiter. An empty string is
    returned when ``start`` is negative or fewer than two spaces follow it.
    """
    if start < 0:
        return ''
    first_space = text.find(' ', start)
    if first_space == -1:
        return ''
    second_space = text.find(' ', first_space + 1)
    if second_space == -1:
        return ''
    third_space = text.find(' ', second_space + 1)
    if third_space == -1:
        third_space = len(text)
    return text[second_space + 1:third_space]


def parse_value(tt):
    """Parse the feature values of one neighborhood from its PDF text.

    Parameters
    ----------
    tt : str
        Text of one PDF, as returned by ``convert_pdf_to_txt``.

    Returns
    -------
    list of float
        21 values in this order: median household income, median housing
        value, total population, persons per sq. mile, followed by 17
        percentages (one per table row, in order of appearance). Income,
        housing, population and density are NaN when their value is blank
        or cannot be located.

    Raises
    ------
    ValueError
        If a located token is not numeric.
    IndexError
        If fewer percentage values than required are found before
        'Median Housing Value'.
    """
    # Everything before the word 'characteristics' is skipped; if the word is
    # missing, the whole text is searched instead of just its last character.
    begin = tt.find('characteristics')
    text = tt[begin:] if begin != -1 else tt

    # Dollar amounts: the second '$' after the label is used (the original
    # comment describes it as the 2015 figure -- TODO confirm against the PDFs).
    income_median = _second_dollar_amount(text, 'Median Household Income')
    housing_median = _second_dollar_amount(text, 'Median Housing Value')

    # Total population: second space-delimited value after the word 'population'.
    pop_at = text.find('population')
    population_total = _to_number(_second_value_after(text, pop_at))

    # Persons per sq. mile: second value after the first 'sq. mile' that
    # follows the population label. The +5 offset skips past the space inside
    # 'sq. mile' so that it is not counted as a delimiter.
    first_space = text.find(' ', pop_at) if pop_at != -1 else -1
    density_at = text.find('sq. mile', first_space + 1) if first_space != -1 else -1
    density_start = density_at + 5 if density_at != -1 else -1
    psmile_person = _to_number(_second_value_after(text, density_start))

    # Percentages: every '%' between the first '%' and 'Median Housing Value'.
    first_pct = text.find('%')
    housing_at = text.find('Median Housing Value')
    section_start = max(first_pct - 2, 0) if first_pct != -1 else len(text)
    section_end = housing_at if housing_at != -1 else len(text)
    section = text[section_start:section_end]

    percents = []
    for pos, char in enumerate(section):
        if char != '%':
            continue
        # Normally the two characters before '%' hold the value (' 5', '12').
        # A three-digit run such as '100' is taken whole so that 100% is not
        # misread as 0.
        digits = section[pos - 2:pos]
        if pos >= 3:
            three = section[pos - 3:pos]
            if three.isdecimal():
                digits = three
        percents.append(float(digits))

    # Each of the 17 table rows contributes four percentages; the second one
    # of every row is kept (presumably the neighborhood's latest figure --
    # TODO confirm against the PDF column layout).
    rows = 17
    values_per_row = 4
    kept_column = 1
    result = [income_median, housing_median, population_total, psmile_person]
    for row in range(rows):
        result.append(percents[values_per_row * row + kept_column])
    return result

In [203]:
# Convert each neighborhood's PDF to text and parse its feature values.
# `data` maps neighborhood name -> list of values returned by parse_value.
neighbor = df_n['HoustonNeighborhoods'].to_numpy()
data = {}
for name, pdf_name in zip(neighbor, filename):
    data[name] = parse_value(convert_pdf_to_txt(pdf_name))

#### b) Generate and combine dataframes

In [204]:
# One row per neighborhood (dict key) and one column per parsed feature, plus
# the numeric 'Index' taken from df_n in row order so the two tables can be joined.
df_data = (
    pd.DataFrame.from_dict(
        data,
        orient='index',
        columns=[
            'Income_Median', 'Housing_Median', 'Population_Total', 'Psmile_Person',
            'Age%_Under5', 'Age%_5_17', 'Age%_18_64', 'Age%_65up',
            'Non_Hispanic%_W', 'Non_Hispanic%_B', 'Hispanic%', 'Non_Hispanic%_A',
            'Nin_Hispanic%_O',
            'Income%_U25k', 'Income%_U50k', 'Income%_U100k', 'Income%_100kup',
            'No_Diploma%', 'High_School%', 'College%', 'Bachelor_Or_Higher%',
        ],
    )
    .assign(Index=df_n['Index'].to_numpy())
)

# Attach the parsed features to the neighborhood names and coordinates.
dft = df_n.merge(df_data, on='Index')
dft.head()

Unnamed: 0,HoustonNeighborhoods,Index,Latitude,Longitude,Income_Median,Housing_Median,Population_Total,Psmile_Person,Age%_Under5,Age%_5_17,Age%_18_64,Age%_65up,Non_Hispanic%_W,Non_Hispanic%_B,Hispanic%,Non_Hispanic%_A,Nin_Hispanic%_O,Income%_U25k,Income%_U50k,Income%_U100k,Income%_100kup,No_Diploma%,High_School%,College%,Bachelor_Or_Higher%
0,9_Addicks_Park_Ten,9,29.8133,-95.6455,80584.0,168155.0,19683.0,840.0,5.0,17.0,71.0,7.0,41.0,15.0,32.0,12.0,1.0,11.0,24.0,33.0,32.0,9.0,19.0,27.0,39.0
1,23_AftonOaks_RiverOaks,23,29.749994,-95.433234,95682.0,499169.0,14518.0,4021.0,5.0,14.0,61.0,20.0,77.0,4.0,10.0,7.0,2.0,9.0,10.0,23.0,58.0,3.0,4.0,19.0,74.0
2,25_Alief,25,29.6827,-95.5932,41833.0,90655.0,106657.0,7544.0,7.0,20.0,65.0,8.0,9.0,22.0,49.0,19.0,1.0,32.0,33.0,25.0,9.0,35.0,26.0,24.0,15.0
3,34_Astrodome_Area,34,29.685045,-95.409813,46284.0,102268.0,18223.0,4846.0,5.0,6.0,84.0,5.0,34.0,22.0,10.0,31.0,3.0,26.0,26.0,32.0,15.0,2.0,7.0,20.0,72.0
4,30_Braeburn,30,29.682779,-95.53498,42958.0,116547.0,18843.0,4711.0,8.0,18.0,63.0,10.0,24.0,16.0,55.0,5.0,0.0,41.0,26.0,22.0,11.0,28.0,30.0,22.0,20.0
