# Assignment 1 (new working doc)

## Links
- https://cbscanvas.instructure.com/courses/41482/files/1504373?wrap=1
- https://insideairbnb.com/get-the-data/

## Approach
1. data cleaning & standardization
### EDA
2. correlation analysis
3. graphing
### Clustering
4. choose clustering algorithm
5. run clustering algorithm
6. graph clusters
### PCA
7. run pca

In [52]:
# imports
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [53]:
# data
with open('./data/listings.csv', 'r') as file:
    df_listings = pd.read_csv(file)
    
df_listings.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,371299,Marais Rue des Archives refait à neuf février ...,1870265,Thomas,,Hôtel-de-Ville,48.85751,2.35511,Entire home/apt,185.0,3,601,2024-09-06,3.97,3,307,54,7510300547558
1,371862,loft in Paris (Belleville),1872631,Veronique,,Entrepôt,48.87151,2.37219,Entire home/apt,250.0,4,50,2023-08-14,0.34,1,9,0,7511000320406
2,372879,Appartement complet au centre de Paris.,1876396,Samuel,,Gobelins,48.83593,2.35108,Entire home/apt,85.0,30,171,2024-08-15,2.01,3,151,1,"Available with a mobility lease only (""bail mo..."
3,375434,"Luxurious Art & Design Flat, 180m2, Champs Ely...",683140,Oscar,,Élysée,48.8668,2.30972,Entire home/apt,,3,22,2023-07-15,0.15,1,0,0,7510806561916
4,378897,Little flat near Montmartre,1902818,Lorraine,,Opéra,48.88285,2.33462,Entire home/apt,110.0,3,28,2024-07-30,0.19,1,6,2,7511805895013


## 1. data cleaning & standardization
- after this step we will have two dataframes one with only numeric columns and the other with only both
- we standardize the data with z-score normalization

In [54]:
## drop na prices
df_listings_clean = df_listings.dropna(subset=['price'])

## drop irrelevant columns
drop_cols_listing = ['id', 'host_id', 'neighbourhood_group', 'license', 'last_review']
df_listings_clean = df_listings_clean.drop(columns=drop_cols_listing)

## non numeric columns
def drop_non_numeric(df):
    return df.select_dtypes(include=['number'])

df_listings_num = drop_non_numeric(df_listings_clean)

df_listings_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64230 entries, 0 to 95460
Data columns (total 13 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   name                            64230 non-null  object 
 1   host_name                       64230 non-null  object 
 2   neighbourhood                   64230 non-null  object 
 3   latitude                        64230 non-null  float64
 4   longitude                       64230 non-null  float64
 5   room_type                       64230 non-null  object 
 6   price                           64230 non-null  float64
 7   minimum_nights                  64230 non-null  int64  
 8   number_of_reviews               64230 non-null  int64  
 9   reviews_per_month               46054 non-null  float64
 10  calculated_host_listings_count  64230 non-null  int64  
 11  availability_365                64230 non-null  int64  
 12  number_of_reviews_ltm           64230

In [55]:
df_listings_clean.isna().sum()

name                                  0
host_name                             0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 18176
calculated_host_listings_count        0
availability_365                      0
number_of_reviews_ltm                 0
dtype: int64

In [56]:
# we make the assumption that if revievews_per_month is nan, it means that there are no reviews and we can replace it with 0
df_listings_clean['reviews_per_month'] = df_listings_clean['reviews_per_month'].fillna(0)

In [59]:
# standardize the data
def z_score_normalize(col):
    col_mean = np.mean(col)
    col_variance = np.std(col)
    return((col-col_mean)/col_variance)

df_listings_norm = pd.DataFrame()
for col in df_listings_num.columns:
    if col == 'price':
        df_listings_norm[col] = df_listings_num[col]
    else:
        df_listings_norm[col] = z_score_normalize(df_listings_num[col])
    
df_listings_norm.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm
0,-0.346187,0.399319,185.0,-0.188825,9.415228,1.836377,-0.260815,1.026043,3.504188
1,0.431265,0.893292,250.0,-0.164072,0.404461,-0.707936,-0.279409,-1.44996,-0.549648
2,-1.544573,0.282767,85.0,0.4795,2.383232,0.462588,-0.260815,-0.270119,-0.474577
4,1.061,-0.193275,110.0,-0.188825,0.044684,-0.813073,-0.279409,-1.474886,-0.399506
5,0.325198,0.34032,300.0,0.058703,-0.315092,-0.91821,-0.279409,1.491333,-0.324435


## 2. Correlation analysis

In [None]:
## Convert column to list
def calc_mean(col):
    ## Calculate mean
    sum_col = sum(col)
    count_col = len(col)
    mean_col = round(col / count_col,6)
    

for col in df_listings_num.columns:
    col_list = df_listings_num.loc[:, col].dropna().tolist()

        
## Calculate variance
    squared_dist = sum([(mean_col-x)**2 for x in col_list])
    var_col = squared_dist / count_col

## Calculate standard deviation  
    sd_col = var_col ** (1/2)

## print results
    print(f"Mean of {col} is {mean_col}")
    print(f"Variance of {col} is {var_col}")
    print(f"Standard Deviation of {col} is {sd_col}")
    

# Loop through each pair of columns
for col1 in numeric_df.columns:
    for col2 in numeric_df.columns:
        # Convert first column to list
        x_list = numeric_df[col1].to_list()
        # Calculate mean of first column
        x_mean = np.mean(x_list)
        # Convert second column to list
        y_list = numeric_df[col2].to_list()
        # Calculate mean of second column
        y_mean = np.mean(y_list)
        # Calculate covariance
        sum_cross_variations = sum([(x - x_mean) * (y_list[i] - y_mean) for i, x in enumerate(x_list)])
        cov_xy = sum_cross_variations / len(x_list)
        
        # Calculate std for correlation
        std_x = np.std(x_list)
        std_y = np.std(y_list)
        
        # Calculate correlation
        corr_xy = cov_xy / (std_x * std_y)
        
        print(f"Correlation between {col1} and {col2} is {corr_xy}")