In [1]:
# Import dependencies
import json
import pandas as pd
import numpy as np
import re
from sqlalchemy import create_engine
import time
from config import db_password

In [2]:
# Read the file into pandas dataframe
file_dir = 'C:/Users/gabi_/OneDrive/Desktop/Analysis/main/First Segment Rubric - Extract/'

In [3]:
# Read the file
pittsburgh_data_raw = pd.read_csv(f'{file_dir}pittsburgh_data_raw.csv', low_memory=False)
pittsburgh_data_raw.head()

Unnamed: 0,date,title,link,price,bedroom,sqft,neighborhood,bathroom,amenities
0,Dec 31,Green Tree City/5 minutes from Downtown,https://pittsburgh.craigslist.org/apa/d/pittsb...,"$1,575",3,"available jan 2, 2",Kirsopp Avenue 5 Minutes from Downtown,2.0,"['air conditioning', 'house', 'w/d in unit', '..."
1,Dec 31,Renovated House on Pioneer ave!! Pets welcome.,https://pittsburgh.craigslist.org/apa/d/pittsb...,"$1,650",3,1426,"2048 Pioneer Avenue, Pittsburgh, PA",1.0,"['cats are OK - purrr', 'dogs are OK - wooof',..."
2,Dec 31,JANUARY AVAILABILITY!! CALL NOW!!!,https://pittsburgh.craigslist.org/apa/d/coraop...,"$1,315",1,825,"916 Beaver Grade Road, Coraopolis, PA",1.0,"['air conditioning', 'cats are OK - purrr', 'd..."
3,Dec 31,This beautifully renovated whole house.,https://pittsburgh.craigslist.org/apa/d/pittsb...,$900,3,1397,,2.0,"['apartment', 'w/d in unit', 'attached garage'..."
4,Dec 31,* Beautifully Clean & Renovated Apt* Very Larg...,https://pittsburgh.craigslist.org/apa/d/pittsb...,$700,2,,Pittsburgh,1.0,"['air conditioning', 'cats are OK - purrr', 'd..."


In [4]:
# Put data into dataframe
pittsburgh_data_df = pd.DataFrame(pittsburgh_data_raw)

In [5]:
# Get rows and columns
pittsburgh_data_df.shape

(120, 9)

In [6]:
# Create modified database 
pittsburgh_clean_df = pittsburgh_data_df
pittsburgh_clean_df.head()

Unnamed: 0,date,title,link,price,bedroom,sqft,neighborhood,bathroom,amenities
0,Dec 31,Green Tree City/5 minutes from Downtown,https://pittsburgh.craigslist.org/apa/d/pittsb...,"$1,575",3,"available jan 2, 2",Kirsopp Avenue 5 Minutes from Downtown,2.0,"['air conditioning', 'house', 'w/d in unit', '..."
1,Dec 31,Renovated House on Pioneer ave!! Pets welcome.,https://pittsburgh.craigslist.org/apa/d/pittsb...,"$1,650",3,1426,"2048 Pioneer Avenue, Pittsburgh, PA",1.0,"['cats are OK - purrr', 'dogs are OK - wooof',..."
2,Dec 31,JANUARY AVAILABILITY!! CALL NOW!!!,https://pittsburgh.craigslist.org/apa/d/coraop...,"$1,315",1,825,"916 Beaver Grade Road, Coraopolis, PA",1.0,"['air conditioning', 'cats are OK - purrr', 'd..."
3,Dec 31,This beautifully renovated whole house.,https://pittsburgh.craigslist.org/apa/d/pittsb...,$900,3,1397,,2.0,"['apartment', 'w/d in unit', 'attached garage'..."
4,Dec 31,* Beautifully Clean & Renovated Apt* Very Larg...,https://pittsburgh.craigslist.org/apa/d/pittsb...,$700,2,,Pittsburgh,1.0,"['air conditioning', 'cats are OK - purrr', 'd..."


In [7]:
#Getting null values for Austin
[[column,pittsburgh_clean_df[column].isnull().sum()] for column in pittsburgh_clean_df.columns]

[['date', 0],
 ['title', 0],
 ['link', 0],
 ['price', 0],
 ['bedroom', 0],
 ['sqft', 19],
 ['neighborhood', 2],
 ['bathroom', 0],
 ['amenities', 0]]

In [8]:
# Extract rows that say available
pittsburgh_clean_df["sqft"] = pittsburgh_clean_df["price"].str.replace(",","")

In [9]:
# Count "available on date" rows in sqft
form_one = r'available'
pittsburgh_clean_df["sqft"].str.contains(form_one, flags=re.IGNORECASE, na=False)

0      False
1      False
2      False
3      False
4      False
       ...  
115    False
116    False
117    False
118    False
119    False
Name: sqft, Length: 120, dtype: bool

In [10]:
# Delete rows that say available in sqft
pittsburgh_cleaner_df = pittsburgh_clean_df[~pittsburgh_clean_df["sqft"].str.contains(form_one, flags=re.IGNORECASE, na=False)]

In [11]:
pittsburgh_cleaner_df.head()

Unnamed: 0,date,title,link,price,bedroom,sqft,neighborhood,bathroom,amenities
0,Dec 31,Green Tree City/5 minutes from Downtown,https://pittsburgh.craigslist.org/apa/d/pittsb...,"$1,575",3,$1575,Kirsopp Avenue 5 Minutes from Downtown,2.0,"['air conditioning', 'house', 'w/d in unit', '..."
1,Dec 31,Renovated House on Pioneer ave!! Pets welcome.,https://pittsburgh.craigslist.org/apa/d/pittsb...,"$1,650",3,$1650,"2048 Pioneer Avenue, Pittsburgh, PA",1.0,"['cats are OK - purrr', 'dogs are OK - wooof',..."
2,Dec 31,JANUARY AVAILABILITY!! CALL NOW!!!,https://pittsburgh.craigslist.org/apa/d/coraop...,"$1,315",1,$1315,"916 Beaver Grade Road, Coraopolis, PA",1.0,"['air conditioning', 'cats are OK - purrr', 'd..."
3,Dec 31,This beautifully renovated whole house.,https://pittsburgh.craigslist.org/apa/d/pittsb...,$900,3,$900,,2.0,"['apartment', 'w/d in unit', 'attached garage'..."
4,Dec 31,* Beautifully Clean & Renovated Apt* Very Larg...,https://pittsburgh.craigslist.org/apa/d/pittsb...,$700,2,$700,Pittsburgh,1.0,"['air conditioning', 'cats are OK - purrr', 'd..."


In [12]:
# Drop null rows 
pittsburgh_cleaner_df = pittsburgh_cleaner_df.dropna()

In [13]:
# Drop title
pittsburgh_cleaner_df = pittsburgh_cleaner_df.drop(['title'], axis=1)
pittsburgh_cleaner_df.head()

Unnamed: 0,date,link,price,bedroom,sqft,neighborhood,bathroom,amenities
0,Dec 31,https://pittsburgh.craigslist.org/apa/d/pittsb...,"$1,575",3,$1575,Kirsopp Avenue 5 Minutes from Downtown,2.0,"['air conditioning', 'house', 'w/d in unit', '..."
1,Dec 31,https://pittsburgh.craigslist.org/apa/d/pittsb...,"$1,650",3,$1650,"2048 Pioneer Avenue, Pittsburgh, PA",1.0,"['cats are OK - purrr', 'dogs are OK - wooof',..."
2,Dec 31,https://pittsburgh.craigslist.org/apa/d/coraop...,"$1,315",1,$1315,"916 Beaver Grade Road, Coraopolis, PA",1.0,"['air conditioning', 'cats are OK - purrr', 'd..."
4,Dec 31,https://pittsburgh.craigslist.org/apa/d/pittsb...,$700,2,$700,Pittsburgh,1.0,"['air conditioning', 'cats are OK - purrr', 'd..."
5,Dec 31,https://pittsburgh.craigslist.org/apa/d/coraop...,$795,1,$795,Moon Township,1.0,"['townhouse', 'w/d hookups', 'no smoking', 'of..."


In [14]:
# Drop date
pittsburgh_cleaner_df = pittsburgh_cleaner_df.drop(['date'], axis=1)
pittsburgh_cleaner_df.head()

Unnamed: 0,link,price,bedroom,sqft,neighborhood,bathroom,amenities
0,https://pittsburgh.craigslist.org/apa/d/pittsb...,"$1,575",3,$1575,Kirsopp Avenue 5 Minutes from Downtown,2.0,"['air conditioning', 'house', 'w/d in unit', '..."
1,https://pittsburgh.craigslist.org/apa/d/pittsb...,"$1,650",3,$1650,"2048 Pioneer Avenue, Pittsburgh, PA",1.0,"['cats are OK - purrr', 'dogs are OK - wooof',..."
2,https://pittsburgh.craigslist.org/apa/d/coraop...,"$1,315",1,$1315,"916 Beaver Grade Road, Coraopolis, PA",1.0,"['air conditioning', 'cats are OK - purrr', 'd..."
4,https://pittsburgh.craigslist.org/apa/d/pittsb...,$700,2,$700,Pittsburgh,1.0,"['air conditioning', 'cats are OK - purrr', 'd..."
5,https://pittsburgh.craigslist.org/apa/d/coraop...,$795,1,$795,Moon Township,1.0,"['townhouse', 'w/d hookups', 'no smoking', 'of..."


In [15]:
# Check which columns need to be converted
pittsburgh_cleaner_df.dtypes

link             object
price            object
bedroom           int64
sqft             object
neighborhood     object
bathroom        float64
amenities        object
dtype: object

In [16]:
# Convert bedroom to integer 
pittsburgh_cleaner_df["sqft"] = pittsburgh_cleaner_df["sqft"].astype(int)


ValueError: invalid literal for int() with base 10: '$1575'

In [None]:
# Drop $ and , from price

pittsburgh_cleaner_df["price"] = pittsburgh_cleaner_df["price"].str.replace("$","")
pittsburgh_cleaner_df["price"] = pittsburgh_cleaner_df["price"].str.replace(",","")

In [None]:
# Convert price to integer
pittsburgh_cleaner_df["price"] = pittsburgh_cleaner_df["price"].astype(int)


In [None]:
pittsburgh_cleaner_df['bedroom'].value_counts()

In [None]:
pittsburgh_cleaner_df['amenities'].astype("string")

In [None]:
#Check total count of amenities
num_occurrences = pittsburgh_cleaner_df['amenities'].str.count(',').sum()
print(num_occurrences)

In [None]:
#Create count of amenities
counts = pittsburgh_cleaner_df['amenities'].apply(lambda x: x.count(','))
pittsburgh_cleaner_df['number of amenities'] = counts
pittsburgh_cleaner_df.head()

In [None]:
# Drop amenitites
pittsburgh_cleaner_df = pittsburgh_cleaner_df.drop(['amenities'], axis=1)
pittsburgh_cleaner_df.head()

In [None]:
# Remove duplicate rows
pittsburgh_cleaner_df['link'] = pittsburgh_cleaner_df['link'].str.extract(r'(\d{10}.html)')
print(len(pittsburgh_cleaner_df))
pittsburgh_cleaner_df.drop_duplicates(subset='link', inplace=True)
print(len(pittsburgh_cleaner_df))
pittsburgh_cleaner_df.head()

In [None]:
# Drop link
pittsburgh_cleaner_df = pittsburgh_cleaner_df.drop(['link'], axis=1)
pittsburgh_cleaner_df.head()

In [None]:
pittsburgh_cleaner_df.info(null_counts=True)

In [None]:
# Reset the index after dropping rows
pittsburgh_cleaner_df.reset_index(drop=True, inplace=True)
pittsburgh_cleaner_df.head()

In [None]:
# Create scatter plot to compare no of amenities to price
pittsburgh_cleaner_df.plot(x='price', y='sqft', kind='scatter')

In [None]:
# Create scatter plot to compare sqft to price
pittsburgh_cleaner_df.plot(x='price', y='number of amenities', kind='scatter')

In [None]:
#Save data into csv file
pittsburgh_cleaner_df.to_csv('../First Segment Rubric - Clean/csv/pittsburgh_clean_data.csv',index= False)

In [None]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/capstone_database"

In [None]:
engine = create_engine(db_string)

In [None]:
pittsburgh_cleaner_df.to_sql(name='pittsburgh database', con=engine)