# Project 3
## CS 7324
#### Jennifer Carballo & Amory Weinzierl

In [None]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#### load in datasets

In [None]:
# load in real estate dataset

real_estate_df = pd.read_csv("data/WakeCountyHousing.csv")

# display df
real_estate_df.head()

In [None]:
# load in reddit upvote dataset - test and train

reddit_upvotes_train_df = pd.read_csv("data/train_NIR5Yl1.csv")
reddit_upvotes_test_df = pd.read_csv("data/test_8i3B3FC.csv")

reddit_upvotes_train_df.head()

In [None]:
reddit_upvotes_test_df.head()

In [None]:
# read the uber fares csv and promote its unnamed first column to the index

uber_fares_df = (
    pd.read_csv("data/uber.csv")
    .rename(columns={'Unnamed: 0': 'index'})
    .set_index("index")
)

uber_fares_df.head()

#### explore data

##### explore real estate data

In [None]:
real_estate_df.info()

In [None]:
real_estate_df.describe()

In [None]:
real_estate_df.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
real_estate_df.isna().sum().sum()

In [None]:
# print every column name next to its dtype
for columnName, column_dtype in real_estate_df.dtypes.items():
    print(columnName, column_dtype)

In [None]:
# print every column name next to its count of missing values
for columnName, missing_count in real_estate_df.isna().sum().items():
    print(columnName, missing_count)

##### explore reddit upvote training data

In [None]:
reddit_upvotes_train_df.info()

In [None]:
reddit_upvotes_train_df.describe()

In [None]:
reddit_upvotes_train_df.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
reddit_upvotes_train_df.isna().sum().sum()

In [None]:
for columnName in reddit_upvotes_train_df:
    print(columnName, reddit_upvotes_train_df[columnName].dtypes)

##### explore uber fares data

In [None]:
uber_fares_df.info()

In [None]:
uber_fares_df.describe()

In [None]:
uber_fares_df.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
uber_fares_df.isna().sum().sum()

In [None]:
for columnName in uber_fares_df:
    print(columnName, uber_fares_df[columnName].isna().sum())

In [None]:
# since only two datapoints in entire dataframe are null, simply remove rows where null data is
uber_fares_df = uber_fares_df.dropna()

In [None]:
uber_fares_df.isna().sum().sum()

In [None]:
for columnName in uber_fares_df:
    print(columnName, uber_fares_df[columnName].dtypes)

#### executing tasks on real estate df

In [None]:
# replace missing values

for columnName in real_estate_df:
    print(columnName, real_estate_df[columnName].isna().sum())

In [None]:
real_estate_df['Bath'].value_counts()

In [None]:
real_estate_df[real_estate_df['Bath'].isna()]

In [None]:
real_estate_df[(real_estate_df['Bath'].isna()) & (real_estate_df['Design_Style'] == 'Conventional')]

In [None]:
real_estate_df[real_estate_df['Design_Style'] == "Conventional"]["Bath"].value_counts()

In [None]:
real_estate_df[real_estate_df['Design_Style'] == "Conventional"]["Bath"].mode()[0]

In [None]:
# fill missing Bath values on Conventional homes with the most common Bath value for that style
conventional_rows = real_estate_df['Design_Style'] == "Conventional"
conventional_bath_mode = real_estate_df.loc[conventional_rows, 'Bath'].mode()[0]
real_estate_df.loc[conventional_rows & real_estate_df['Bath'].isna(), 'Bath'] = conventional_bath_mode

In [None]:
real_estate_df[(real_estate_df['Bath'].isna()) & (real_estate_df['Design_Style'] == 'Conventional')]

In [None]:
real_estate_df[real_estate_df['Design_Style'] == "Condo"]["Bath"].mode()[0]

In [None]:
# fill missing Bath values on Condo homes with the most common Bath value for that style
condo_rows = real_estate_df['Design_Style'] == "Condo"
condo_bath_mode = real_estate_df.loc[condo_rows, 'Bath'].mode()[0]
real_estate_df.loc[condo_rows & real_estate_df['Bath'].isna(), 'Bath'] = condo_bath_mode

In [None]:
real_estate_df[(real_estate_df['Bath'].isna()) & (real_estate_df['Design_Style'] == 'Condo')]

In [None]:
for columnName in real_estate_df:
    print(columnName, real_estate_df[columnName].isna().sum())

In [None]:
print(len(real_estate_df))

In [None]:
real_estate_df = real_estate_df.dropna()

In [None]:
for columnName in real_estate_df:
    print(columnName, real_estate_df[columnName].isna().sum())

In [None]:
real_estate_df.isna().sum().sum()

In [None]:
# ordinal-encode the Bath column by hand (no sklearn OrdinalEncoder is used)

real_estate_df['Bath'].value_counts()

In [None]:
# translate every Bath label to its ordinal code with one replace call
bath_codes = {
    'Other': 0,
    '1 Bath': 1,
    '1 ½ Bath': 2,
    '2 Bath': 3,
    '2½ Bath': 4,
    '3 Bath': 5,
    '3½ Bath': 6,
}
real_estate_df['Bath'] = real_estate_df['Bath'].replace(bath_codes)

In [None]:
real_estate_df['Bath'].value_counts()

In [None]:
# one-hot encode Num_Stories and attach the indicator columns to the frame

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(handle_unknown = 'ignore')

encoded_stories = encoder.fit_transform(real_estate_df[['Num_Stories']]).toarray()

# Reuse the source index: rows were removed by dropna() earlier, so a default
# 0..n-1 index would not line up with real_estate_df and the join below would
# pair indicators with the wrong rows and leave NaNs behind.
encoder_df = pd.DataFrame(
    encoded_stories,
    index=real_estate_df.index,
    # one readable column per category the encoder saw during fit
    columns=[f"Num_Stories_{category}" for category in encoder.categories_[0]],
)

final_df = real_estate_df.join(encoder_df)

final_df

In [None]:
real_estate_df.isna().sum().sum()

In [None]:
# implement custom transformer
# https://medium.com/@benlc77/how-to-write-clean-and-scalable-code-with-custom-transformers-sklearn-pipelines-ecb8e53fe110

from sklearn.base import BaseEstimator, TransformerMixin

class DropFeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that removes the configured column(s) from a DataFrame."""

    def __init__(self, variables):
        # column label(s) handed to DataFrame.drop in transform()
        self.variables = variables

    def fit(self, X, y = None):
        """Learn nothing from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return X without the configured column(s) and record the columns that remain."""
        remaining = X.drop(columns=self.variables)
        # keep the surviving column labels on the instance
        self.columns = remaining.columns
        return remaining
    
transformer = DropFeatureSelector('Month_Year_of_Sale')

In [None]:
transformer.fit(real_estate_df)

In [None]:
transformer.transform(real_estate_df)

In [None]:
# standardize the Heated_Area feature with sklearn.preprocessing.StandardScaler

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# present the single column as an (n, 1) array before fitting the scaler
heated_area_column = real_estate_df.Heated_Area.values.reshape(-1,1)
scaled_features = scaler.fit_transform(heated_area_column)
scaled_features

In [None]:
real_estate_df['Heated_Area'] = scaled_features

In [None]:
real_estate_df

#### executing tasks on reddit train df

#### executing tasks on uber fares df

In [None]:
# followed along with textbook classification chapter

uber_fares_df

In [None]:
uber_fares_df.drop(columns=['key'], inplace = True)
uber_fares_df

In [None]:
uber_fares_df.drop(columns=['pickup_datetime'], inplace = True)
uber_fares_df

In [None]:
# feature matrix: pickup/dropoff coordinates plus passenger count
x_data = uber_fares_df[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']]
# target as a 1-D Series; selecting with [['fare_amount']] yields an (n, 1) frame,
# which sklearn classifiers flag with a DataConversionWarning when fitting
y_data = uber_fares_df['fare_amount']

In [None]:
# create train and test sets for uber dataset
from sklearn.model_selection import train_test_split

# 20% of the rows go to the test set; shuffle=False keeps the original row order,
# so the split is taken without shuffling the data first
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data ,test_size = 0.2, shuffle=False)

# boolean targets for the classifiers: True where the fare amount is above 10
y_train_greater10 = (y_train > 10)
y_test_greater10 = (y_test > 10)

In [None]:
# use SGDClassifier
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_greater10)

sgd_y_pred = sgd_clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

print("Accuracy:", accuracy_score(y_test_greater10, sgd_y_pred))

In [None]:
# use sklearn.linear_model.LinearRegression
from sklearn.linear_model import LinearRegression

# A regressor should learn the continuous fare amount, not the boolean
# "fare > 10" labels used by the classifiers.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# score() reports R^2 against the held-out fare amounts
print("Score:",lin_reg.score(X_test, y_test))

In [None]:
# use sklearn.tree.DecisionTreeRegressor

from sklearn.tree import DecisionTreeRegressor

# fixed seed so the fitted tree is reproducible
regressor = DecisionTreeRegressor(random_state = 0)

# Fit on the continuous fare amount: a regressor trained on the boolean
# "fare > 10" labels would only be approximating a 0/1 indicator.
regressor.fit(X_train, y_train)

# score() reports R^2 against the held-out fare amounts
print("Score:",regressor.score(X_test, y_test))

In [None]:
# use sklearn.ensemble.RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier

# fixed seed so the forest (and its accuracy) is reproducible across runs,
# matching the seeded SGD classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# train on the boolean "fare > 10" labels
rf_clf.fit(X_train, y_train_greater10)

rf_y_pred = rf_clf.predict(X_test)

In [None]:
print("Accuracy:", accuracy_score(y_test_greater10, rf_y_pred))

In [None]:
# use sklearn.neighbors.KNeighborsClassifier
# builds two boolean labels per training row and fits a multilabel k-NN classifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict
import numpy as np

# label 1: target is at least 7; label 2: target leaves remainder 1 when divided by 2
# NOTE(review): the target is a fare amount, so the "odd" label is only True for
# values whose remainder is exactly 1 -- confirm this is the intended label
y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
# place the two labels side by side, one row per training sample
y_multilabel = np.c_[y_train_large, y_train_odd]

knn_clf = KNeighborsClassifier(n_neighbors=5)
# NOTE(review): cross_val_predict below presumably refits the estimator per fold,
# so this fit may not influence y_train_knn_pred -- confirm
knn_clf.fit(X_train, y_multilabel)

# predictions for the training rows obtained with 3-fold cross-validation
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)

In [None]:
print("Accuracy:",accuracy_score(y_multilabel, y_train_knn_pred))

In [None]:
# use OvO or OvR classifier

from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression

OvO_clf = OneVsOneClassifier(LogisticRegression())
OvO_clf.fit(X_train, y_train_greater10)

y_pred = OvO_clf.predict(X_test)

In [None]:
print("Accuracy:",accuracy_score(y_test_greater10, y_pred))