# Project 3
## CS 7324
#### Jennifer Carballo & Amory Weinzierl

In [None]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#### load in datasets

In [None]:
# Load the real estate dataset (Wake County housing CSV) from the local data/ folder.

real_estate_df = pd.read_csv("data/WakeCountyHousing.csv")

# Preview the first rows to sanity-check the parse.
real_estate_df.head()

In [None]:
# Load the reddit upvote dataset; the train and test splits ship as separate CSV files.

reddit_upvotes_train_df = pd.read_csv("data/train_NIR5Yl1.csv")
reddit_upvotes_test_df = pd.read_csv("data/test_8i3B3FC.csv")

# Preview the training split.
reddit_upvotes_train_df.head()

In [None]:
# Preview the test split.
reddit_upvotes_test_df.head()

In [None]:
# Load the uber fare dataset. The CSV's leading 'Unnamed: 0' column is renamed
# to 'index' and promoted to the DataFrame index in a single chain.
uber_fares_df = (
    pd.read_csv("data/uber.csv")
    .rename(columns={'Unnamed: 0': 'index'})
    .set_index("index")
)

uber_fares_df.head()

#### explore data

##### explore real estate data

In [None]:
# Column names, dtypes and non-null counts for the real estate data.
real_estate_df.info()

In [None]:
# Summary statistics for the real estate data.
real_estate_df.describe()

In [None]:
# One histogram per column, to eyeball the distributions.
real_estate_df.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Total number of missing cells across the whole frame.
real_estate_df.isna().sum().sum()

In [None]:
# Print each column name next to its dtype.
for col_name, col_dtype in real_estate_df.dtypes.items():
    print(col_name, col_dtype)

In [None]:
# Print each column name next to its count of missing values.
for col_name, n_missing in real_estate_df.isna().sum().items():
    print(col_name, n_missing)

In [None]:
# Frequency of each distinct 'Bath' value.
real_estate_df['Bath'].value_counts()

In [None]:
# Rows where 'Bath' is missing.
real_estate_df[real_estate_df['Bath'].isna()]

In [None]:
# 'Bath' distribution among rows whose Design_Style is "Conventional".
real_estate_df.loc[real_estate_df['Design_Style'] == "Conventional", "Bath"].value_counts()

In [None]:
# 'Bath' distribution among rows whose Design_Style is "Condo".
real_estate_df.loc[real_estate_df['Design_Style'] == "Condo", "Bath"].value_counts()

##### explore reddit upvote training data

In [None]:
# Column names, dtypes and non-null counts for the reddit training data.
reddit_upvotes_train_df.info()

In [None]:
# Summary statistics for the reddit training data.
reddit_upvotes_train_df.describe()

In [None]:
# One histogram per column, to eyeball the distributions.
reddit_upvotes_train_df.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Total number of missing cells across the whole frame.
reddit_upvotes_train_df.isna().sum().sum()

In [None]:
# Print each column name next to its dtype.
for col_name, col_dtype in reddit_upvotes_train_df.dtypes.items():
    print(col_name, col_dtype)

##### explore uber fares data

In [None]:
# Column names, dtypes and non-null counts for the uber fares data.
uber_fares_df.info()

In [None]:
# Summary statistics for the uber fares data.
uber_fares_df.describe()

In [None]:
# One histogram per column, to eyeball the distributions.
uber_fares_df.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Total number of missing cells across the whole frame.
uber_fares_df.isna().sum().sum()

In [None]:
# Print each column name next to its count of missing values.
for col_name, n_missing in uber_fares_df.isna().sum().items():
    print(col_name, n_missing)

In [None]:
# Very few values are missing (the author counted two in the whole frame --
# confirm against the per-column counts printed above), so the affected rows
# are dropped rather than imputed.
uber_fares_df = uber_fares_df.dropna()

In [None]:
# Re-check the total missing-cell count after dropping rows.
uber_fares_df.isna().sum().sum()

In [None]:
# Print each column name next to its dtype.
for col_name, col_dtype in uber_fares_df.dtypes.items():
    print(col_name, col_dtype)

#### executing tasks on real estate df

## Executing tasks on reddit train df

### Preparing Data:

In [None]:
# Check how strongly each numeric attribute correlates with Upvotes.
# numeric_only=True is required because the frame still holds the string
# column "Tag"; without it recent pandas versions raise instead of silently
# skipping non-numeric columns.
corr_matrix = reddit_upvotes_train_df.corr(numeric_only=True)
corr_matrix["Upvotes"].sort_values(ascending=False)

In [None]:
# Try a few ratio features and see whether any of them correlates with
# Upvotes better than the raw columns do.
upvotes_copy = reddit_upvotes_train_df.copy()

upvotes_copy["rep_per_view"] = upvotes_copy["Reputation"] / upvotes_copy["Views"]
upvotes_copy["view_per_rep"] = upvotes_copy["Views"] / upvotes_copy["Reputation"]
upvotes_copy["ans_per_view"] = upvotes_copy["Answers"] / upvotes_copy["Views"]

# A zero denominator produces +/-inf, which would turn that column's
# correlations into NaN; treat those cells as missing instead.
upvotes_copy = upvotes_copy.replace([np.inf, -np.inf], np.nan)

# numeric_only=True skips the string column "Tag", which recent pandas
# versions would otherwise refuse to correlate.
corr_matrix = upvotes_copy.corr(numeric_only=True)
corr_matrix["Upvotes"].sort_values(ascending=False)

In [None]:
# Scatter plot of Upvotes against Views; the low alpha lets dense regions show up darker.
reddit_upvotes_train_df.plot(kind="scatter", x="Views", y="Upvotes", alpha=0.1)

In [None]:
# Keep only the numeric attributes: drop the categorical "Tag" column from both
# splits. drop() returns a new frame, so the source frames are left untouched.
train_numerical = reddit_upvotes_train_df.drop(columns=["Tag"])
test_numerical = reddit_upvotes_test_df.drop(columns=["Tag"])
train_numerical

### Scaling Data

In [None]:
from sklearn import preprocessing

# Split the target off BEFORE scaling. Fitting the scaler on a frame that still
# contains "Upvotes" standardises the label too, so every error metric ends up
# in standardised units rather than upvotes, and the scaler cannot be reused on
# a frame without an "Upvotes" column.
train_labels = train_numerical["Upvotes"].copy()
train_features = train_numerical.drop(columns=["Upvotes"])

# Standardise the feature columns only (zero mean, unit variance).
scalar = preprocessing.StandardScaler().fit(train_features)
X = scalar.transform(train_features)
prepared_train = pd.DataFrame(X, columns=train_features.columns,
                              index=train_features.index)

train_labels

In [None]:
# Train a random-forest regressor on the prepared features.
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(
    n_estimators=10,
    max_depth=10,
    criterion='squared_error',
    random_state=42,  # pin the seed so Restart & Run All reproduces the same scores
)
forest_reg.fit(prepared_train, train_labels)

In [None]:
# Smoke test: predict the first five training rows with the fitted forest.
small_data = prepared_train.head(5)
small_labels = train_labels.head(5)
small_predictions = forest_reg.predict(small_data)
print("Predictions:", small_predictions)

In [None]:
# True labels for the same five rows, to compare with the predictions printed above.
print("Labels:", list(small_labels))

### Task #1: K-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

def display_scores(scores):
    """Print a set of cross-validation scores with their mean and spread.

    Parameters
    ----------
    scores : array-like of float
        Per-fold scores. Any sequence NumPy can convert is accepted, so a
        plain list works as well as the ndarray that cross_val_score returns.
    """
    # Convert once so .mean()/.std() exist even for list or tuple input.
    scores = np.asarray(scores)
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# Cross-validate on the first 10,000 rows only.
x_train = prepared_train[:10000]
y_train = train_labels[:10000]

# 10-fold CV. sklearn reports the negated MSE, so flip the sign before taking
# the square root to get a per-fold RMSE.
reg_scores = cross_val_score(
    forest_reg,
    x_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
reg_rmse_scores = np.sqrt(-reg_scores)

display_scores(reg_rmse_scores)

### Task #2: Stratified K-Fold Cross Validation

In [None]:
# For the classifier the "Tag" attribute is the target. Only the first 10,000
# rows are used (the same rows as x_train) to keep the run time manageable.
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Imported here because this is the first cell that uses the classifier; without
# it a fresh kernel fails with NameError on the line that builds rnd_clf.
from sklearn.ensemble import RandomForestClassifier

y_train_cat = reddit_upvotes_train_df["Tag"][:10000]

rnd_clf = RandomForestClassifier()
rnd_clf.fit(x_train, y_train_cat)

skf = StratifiedKFold(n_splits=3, shuffle=False)
for cnt, (train_index, test_index) in enumerate(skf.split(x_train, y_train_cat), start=1):
    # clone() gives an unfitted copy with the same hyper-parameters, so every
    # fold starts from scratch.
    clone_clf = clone(rnd_clf)

    # split() yields positional indices, so use .iloc on both frame and labels.
    x_train_folds = x_train.iloc[train_index]
    y_train_folds = y_train_cat.iloc[train_index]
    x_test_fold = x_train.iloc[test_index]
    y_test_fold = y_train_cat.iloc[test_index]

    clone_clf.fit(x_train_folds, y_train_folds)

    # Score on the held-out fold; scoring on the training folds would only
    # report how well the forest memorised them.
    y_pred = clone_clf.predict(x_test_fold)
    print("Accuracy for Fold", cnt, ":", accuracy_score(y_test_fold, y_pred))

### Task #3: Use sklearn.metrics.mean_squared_error and One Other Option to Evaluate Model Performance

#### Mean Squared Error

In [None]:
from sklearn.metrics import mean_squared_error

# Error of forest_reg on x_train. Note these rows are part of the data the
# forest was fitted on, so this is a training error.
predictions = forest_reg.predict(x_train)
reg_mse = mean_squared_error(y_train, predictions)
reg_rmse = np.sqrt(reg_mse)

print("MSE:", reg_mse)
print("RMSE:", reg_rmse)

#### R^2 Score

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination for the same predictions used for the MSE above.
r2 = r2_score(y_train, predictions)
r2

### Task #4: Generate a Confusion Matrix

In [None]:
# Confusion matrix for the tag classifier, built from out-of-fold predictions
# (3-fold) so every row is predicted by a model that did not train on it.
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

y_pred = cross_val_predict(
    rnd_clf,
    x_train,
    y_train_cat,
    cv=3,
)
tag_confusion = confusion_matrix(y_train_cat, y_pred)
tag_confusion

### Task #5: Generate ROC Curve

In [None]:
# Logistic regression wrapped in a one-vs-rest scheme for the multi-class "Tag"
# target: the problem is reduced to binary classifiers so that a ROC curve can
# be drawn in the next step.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

model = LogisticRegression()
ovr = OneVsRestClassifier(model)
ovr.fit(x_train, y_train_cat)
# Predictions on the same rows the classifier was just fitted on.
yhat = ovr.predict(x_train)
yhat

In [None]:
from sklearn.metrics import roc_curve
from sklearn.preprocessing import label_binarize

# Out-of-fold decision scores, cross-validated against the TRUE tags (not the
# model's own predictions). For a multi-class problem this has one column of
# scores per class.
y_scores = cross_val_predict(ovr, x_train, y_train_cat, cv=3,
                             method="decision_function")

# roc_curve only handles binary targets, so one-hot encode the tags using the
# same sorted class order that the score columns follow.
tag_classes = np.unique(y_train_cat)
y_true_bin = label_binarize(y_train_cat, classes=tag_classes)

# Micro-averaged ROC: flatten every (sample, class) pair into one binary problem.
fpr, tpr, thresholds = roc_curve(y_true_bin.ravel(), np.asarray(y_scores).ravel())

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on the current matplotlib axes.

    Parameters
    ----------
    fpr, tpr : array-like
        False-positive and true-positive rates, as returned by roc_curve.
    label : str, optional
        Legend entry for the curve. A legend is drawn only when this is given.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal for reference
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    if label is not None:
        # Without a legend the label passed to plot() is never displayed.
        plt.legend(loc="lower right")

plot_roc_curve(fpr, tpr)
plt.show()

### Task #6: Use Grid Search CV to Tune Hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

# max_features cannot usefully exceed the number of feature columns: depending
# on the scikit-learn version, larger values are either rejected (the candidate
# then scores NaN) or behave like "use every feature", duplicating work.
# Keep only the candidates that fit the data.
n_features = x_train.shape[1]
bootstrap_max_features = [m for m in (2, 4, 6, 8) if m <= n_features]
no_bootstrap_max_features = [m for m in (2, 3, 4) if m <= n_features]

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': bootstrap_max_features},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': no_bootstrap_max_features},
]

# Fixed seed so the ranking of candidates is reproducible between runs.
new_forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(new_forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(x_train, y_train)

In [None]:
# Report the cross-validated RMSE of every hyper-parameter combination.
cvres = grid_search.cv_results_

# Scores are negated MSE, so negate and take the root for the whole array at once.
rmse_per_candidate = np.sqrt(-cvres["mean_test_score"])
for candidate_rmse, params in zip(rmse_per_candidate, cvres["params"]):
    print(candidate_rmse, params)

In [None]:
# The estimator built from the best-scoring parameter combination.
grid_search.best_estimator_

### Task #7: Use an Ensemble of Methods

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

# Fit the ensemble on the first 1,000 rows only.
small_x, small_y = x_train[:1000], y_train_cat[:1000]

# Three different model families, each with default settings.
log_clf, rnd_clf, svm_clf = LogisticRegression(), RandomForestClassifier(), SVC()

# Hard voting: the ensemble predicts the class chosen by most members.
ensemble_members = [('lr', log_clf), ('rf', rnd_clf), ('sv', svm_clf)]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')
voting_clf.fit(small_x, small_y)

In [None]:
# Score the ensemble on rows it was NOT fitted on: it was trained on the first
# 1,000 rows of x_train, so the next 1,000 serve as a hold-out set. Scoring on
# the training rows would overstate the accuracy.
holdout_x = x_train[1000:2000]
holdout_y = y_train_cat[1000:2000]

y_pred = voting_clf.predict(holdout_x)
print("Ensemble Accuracy: ", accuracy_score(holdout_y, y_pred))

### Task #8: Evaluate Your System on the Test Data

In [None]:
# GridSearchCV refits the best estimator on the data passed to fit() (refit is
# on by default), so it is used as-is. Fitting it again on the evaluation rows
# would make the score a training error.
final_model = grid_search.best_estimator_

# Evaluate on rows beyond the first 10,000, which are the rows used for the
# grid search. Assumes the frame has more than 10,000 rows -- checked below.
x_test = prepared_train[10000:]
y_test = train_labels[10000:]
if len(x_test) == 0:
    raise ValueError("No rows left after the first 10,000 to use as a hold-out set")

final_predictions = final_model.predict(x_test)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

### Task #9: Create a Single Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# NOTE(review): 'ID' and 'Username' look like identifiers rather than
# measurements -- confirm they belong in the feature set.
numeric_features = ['ID', 'Reputation', 'Answers', 'Username', 'Views']
categorical_features = ['Tag']

# Numeric columns: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with a constant placeholder, then map each
# category to an integer. Categories not seen during fit are encoded as -1
# instead of raising at prediction time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features),
    ])

# Preprocessing and model in one estimator, so the scaler/encoder statistics
# are learned only from whatever data the pipeline is fitted on.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(bootstrap=False, max_features=4,
                                        n_estimators=10, random_state=42)),
])

y_upvotes = reddit_upvotes_train_df["Upvotes"]
x_upvotes = reddit_upvotes_train_df.drop(columns=["Upvotes"])

# Hold out 20% of the rows for evaluation. Fitting on every row and then
# scoring on a slice of those same rows would report a training error.
x_fit, x_test, y_fit, y_test = train_test_split(
    x_upvotes, y_upvotes, test_size=0.2, random_state=42)

model = pipeline.fit(x_fit, y_fit)
final_predictions = model.predict(x_test)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

#### executing tasks on uber fares df