In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
df_o = pd.read_csv(Path('Resources/joined_df.csv'), index_col=0)

df_o.head()

Unnamed: 0,year_month,postal_code,city,state_,county,latitude,longitude,median_listing_price,median_listing_price_mm,median_listing_price_yy,...,average_listing_price,average_listing_price_mm,average_listing_price_yy,total_listing_count,total_listing_count_mm,total_listing_count_yy,pending_ratio,pending_ratio_mm,pending_ratio_yy,quality_flag
0,2022-06-01,33572,Apollo Beach,FL,HILLSBOROUGH,27.84579,-82.331983,731950.0,0.1665,0.5577,...,861186.0,0.0181,0.2696,280.0,0.1382,0.3146,1.9785,-0.7336,-1.5534,1.0
1,2022-06-01,28445,Holly Ridge,NC,ONSLOW,34.527983,-77.514707,687450.0,0.1325,0.4761,...,752644.0,0.0795,0.1772,317.0,0.0893,0.0063,3.1429,-1.0714,-1.0905,1.0
2,2022-06-01,78634,Hutto,TX,WILLIAMSON,30.551885,-97.554189,472500.0,0.0071,0.2459,...,529484.0,-0.0249,0.2655,182.0,0.4677,0.3,0.3358,-0.2465,-0.582,1.0
3,2022-06-01,20872,Damascus,MD,MONTGOMERY,39.285477,-77.215203,731995.0,0.0028,0.2642,...,839165.0,-0.0137,0.191,36.0,0.0909,-0.2,1.25,0.3676,-0.1944,1.0
4,2022-06-01,87121,Albuquerque,NM,BERNALILLO,35.059663,-106.805535,271000.0,0.0131,0.2047,...,308163.0,0.0424,0.2202,159.0,-0.0305,-0.0185,3.5429,0.3634,-1.0089,1.0


In [3]:
df_o.dtypes

year_month                                  object
postal_code                                  int64
city                                        object
state_                                      object
county                                      object
latitude                                   float64
longitude                                  float64
median_listing_price                       float64
median_listing_price_mm                    float64
median_listing_price_yy                    float64
active_listing_count                       float64
active_listing_count_mm                    float64
active_listing_count_yy                    float64
median_days_on_market                      float64
median_days_on_market_mm                   float64
median_days_on_market_yy                   float64
new_listing_count                          float64
new_listing_count_mm                       float64
new_listing_count_yy                       float64
price_increased_count          

In [4]:
target = ['median_listing_price']

df = df_o.drop(columns=['year_month','postal_code','state_','county','city'])

In [5]:
df.columns

Index(['latitude', 'longitude', 'median_listing_price',
       'median_listing_price_mm', 'median_listing_price_yy',
       'active_listing_count', 'active_listing_count_mm',
       'active_listing_count_yy', 'median_days_on_market',
       'median_days_on_market_mm', 'median_days_on_market_yy',
       'new_listing_count', 'new_listing_count_mm', 'new_listing_count_yy',
       'price_increased_count', 'price_increased_count_mm',
       'price_increased_count_yy', 'price_reduced_count',
       'price_reduced_count_mm', 'price_reduced_count_yy',
       'pending_listing_count', 'pending_listing_count_mm',
       'pending_listing_count_yy', 'median_listing_price_per_square_foot',
       'median_listing_price_per_square_foot_mm',
       'median_listing_price_per_square_foot_yy', 'median_square_feet',
       'median_square_feet_mm', 'median_square_feet_yy',
       'average_listing_price', 'average_listing_price_mm',
       'average_listing_price_yy', 'total_listing_count',
       'total_list

In [6]:
# Create our features

X = df.drop(columns=target)

X = pd.get_dummies(X)

# Create our target
y = df.loc[:, target].copy()

In [7]:
print(X)

        latitude   longitude  median_listing_price_mm  \
0      27.845790  -82.331983                   0.1665   
1      34.527983  -77.514707                   0.1325   
2      30.551885  -97.554189                   0.0071   
3      39.285477  -77.215203                   0.0028   
4      35.059663 -106.805535                   0.0131   
...          ...         ...                      ...   
98152  40.073328  -85.999521                  -0.0623   
98153  38.983971  -76.623121                  -0.0493   
98154  39.695488 -119.811146                   0.0543   
98155  30.607780  -81.682889                   0.0000   
98156  28.239369  -82.736882                  -0.0490   

       median_listing_price_yy  active_listing_count  active_listing_count_mm  \
0                       0.5577                  93.0                   0.4091   
1                       0.4761                  77.0                   0.3750   
2                       0.2459                 134.0                   0

In [8]:
X.describe()

Unnamed: 0,latitude,longitude,median_listing_price_mm,median_listing_price_yy,active_listing_count,active_listing_count_mm,active_listing_count_yy,median_days_on_market,median_days_on_market_mm,median_days_on_market_yy,...,average_listing_price,average_listing_price_mm,average_listing_price_yy,total_listing_count,total_listing_count_mm,total_listing_count_yy,pending_ratio,pending_ratio_mm,pending_ratio_yy,quality_flag
count,98157.0,98157.0,98157.0,98157.0,98157.0,98157.0,98157.0,98157.0,98157.0,98157.0,...,98157.0,98157.0,98157.0,98157.0,98157.0,98157.0,98157.0,98157.0,98157.0,98157.0
mean,35.179574,-94.616447,0.007922,0.088806,143.915798,0.008284,-0.044264,51.726887,0.024181,-0.028585,...,523826.4,0.00846,0.090491,239.341382,0.003359,-0.00907,1.052195,0.005867,0.21181,0.217325
std,5.468655,16.503183,0.055198,0.153244,151.022943,0.161081,0.427853,24.139015,0.235153,0.359101,...,503728.3,0.126059,0.203561,191.653064,0.113743,0.298713,1.07461,0.454665,0.976657,0.412428
min,19.390023,-168.021815,-0.5864,-0.6653,3.0,-0.848,-0.9712,2.0,-0.9167,-0.95,...,39227.0,-0.9711,-0.9287,8.0,-0.7071,-0.8245,0.0,-11.8571,-24.9744,0.0
25%,30.551885,-111.792658,-0.0165,0.0018,55.0,-0.0769,-0.2921,36.0,-0.1078,-0.2308,...,295411.0,-0.0174,-0.0024,117.0,-0.058,-0.1844,0.4233,-0.0914,-0.106,0.0
50%,34.127458,-88.14,0.0024,0.0653,100.0,0.0,-0.0737,47.0,0.0208,-0.0641,...,404001.0,0.0042,0.0639,186.0,-0.0038,-0.0456,0.72,-0.0017,0.0435,0.0
75%,39.336058,-81.502963,0.028,0.147,178.0,0.0746,0.1333,63.0,0.1441,0.1136,...,575831.0,0.0285,0.1504,299.0,0.0553,0.1088,1.3333,0.0915,0.3549,0.0
max,48.935767,111.96,1.0114,4.5284,2658.0,3.0,22.0,543.0,4.25,13.6667,...,19430630.0,32.0937,31.6237,2878.0,2.9359,8.5926,89.6667,56.9167,87.8095,1.0


In [9]:
x_train = df.drop(['median_listing_price'], axis=1)
y_train = df['median_listing_price']

In [10]:
df2 = pd.read_csv(Path('Resources/NC_Hotness.csv'), index_col=0)

df2.head()

Unnamed: 0,date,County,county_fips,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count
57,2022-06-01,cherokee,37039,362500.0,271.0,38.0,148.0,8.0,76.0,1904.0,407510.0,467.0
144,2022-06-01,craven,37049,313500.0,154.0,36.0,196.0,12.0,48.0,1927.0,396716.0,581.0
161,2022-06-01,catawba,37035,342450.0,207.0,34.0,224.0,0.0,96.0,1923.0,521108.0,508.0
240,2022-06-01,gaston,37071,330000.0,283.0,30.0,412.0,12.0,200.0,1668.0,377406.0,828.0
280,2022-06-01,warren,37185,275000.0,19.0,68.0,12.0,0.0,4.0,1824.0,419452.0,29.0


In [11]:
df2.shape

(7196, 12)

In [12]:
import xgboost
classifier=xgboost.XGBRegressor()
classifier.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [13]:
from sklearn.ensemble import RandomForestRegressor


In [14]:
y_pred=classifier.predict(x_train)

In [15]:
#pred=pd.DataFrame(y_pred)
pd.DataFrame({"Prediction": y_pred, "Actual": y_train}).head(20)

Unnamed: 0,Prediction,Actual
0,692625.4,731950.0
1,706893.0,687450.0
2,484651.5,472500.0
3,770360.0,731995.0
4,283116.3,271000.0
5,515361.1,507450.0
6,525447.0,512000.0
7,249585.9,249950.0
8,402616.0,400000.0
9,3475735.0,3445000.0


In [16]:
X = df.drop(['median_listing_price'], axis=1)
y = df['median_listing_price']

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=1)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
                                            learning_rate=learning_rate,
                                            max_features=5,
                                            max_depth=3,
                                            random_state=0)

    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train_scaled,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test_scaled,
            y_test)))
    print()

MemoryError: Unable to allocate 12.3 GiB for an array with shape (73617, 22411) and data type float64

In [19]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=0.5,
                                        max_features=5,
                                        max_depth=3,
                                        random_state=0)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make Prediction
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head(20)

MemoryError: Unable to allocate 12.3 GiB for an array with shape (73617, 22411) and data type float64

In [20]:
# Undersample the data using `RandomUnderSampler`
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({26950.0: 1,
         27000.0: 1,
         29900.0: 1,
         32000.0: 1,
         34500.0: 1,
         34821.0: 1,
         35000.0: 1,
         36014.0: 1,
         36064.0: 1,
         36243.0: 1,
         36850.0: 1,
         37750.0: 1,
         37900.0: 1,
         38500.0: 1,
         38900.0: 1,
         39000.0: 1,
         39050.0: 1,
         39450.0: 1,
         39900.0: 1,
         39907.0: 1,
         40000.0: 1,
         40950.0: 1,
         41714.0: 1,
         42000.0: 1,
         42900.0: 1,
         42950.0: 1,
         43250.0: 1,
         43900.0: 1,
         43950.0: 1,
         44400.0: 1,
         44428.0: 1,
         44450.0: 1,
         44700.0: 1,
         44900.0: 1,
         44950.0: 1,
         45000.0: 1,
         45250.0: 1,
         45900.0: 1,
         46000.0: 1,
         46250.0: 1,
         46450.0: 1,
         46500.0: 1,
         47000.0: 1,
         47250.0: 1,
         47450.0: 1,
         47698.0: 1,
         47700.0: 1,
         4800

In [22]:
# Fit a Logistic regression model using random undersampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

KeyboardInterrupt: 

In [23]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [None]:
# Calculate the Balanced Accuracy Score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

In [24]:
increase_or_decrease = []
for i in df_o['median_listing_price_mm']:
    if i > 0:
        increase_or_decrease.append(1)
    else:
        increase_or_decrease.append(0)

In [25]:
binary_df = df_o.copy()
binary_df['increase_or_decrease'] = increase_or_decrease
binary_df

Unnamed: 0,year_month,postal_code,city,state_,county,latitude,longitude,median_listing_price,median_listing_price_mm,median_listing_price_yy,...,average_listing_price_mm,average_listing_price_yy,total_listing_count,total_listing_count_mm,total_listing_count_yy,pending_ratio,pending_ratio_mm,pending_ratio_yy,quality_flag,increase_or_decrease
0,2022-06-01,33572,Apollo Beach,FL,HILLSBOROUGH,27.845790,-82.331983,731950.0,0.1665,0.5577,...,0.0181,0.2696,280.0,0.1382,0.3146,1.9785,-0.7336,-1.5534,1.0,1
1,2022-06-01,28445,Holly Ridge,NC,ONSLOW,34.527983,-77.514707,687450.0,0.1325,0.4761,...,0.0795,0.1772,317.0,0.0893,0.0063,3.1429,-1.0714,-1.0905,1.0,1
2,2022-06-01,78634,Hutto,TX,WILLIAMSON,30.551885,-97.554189,472500.0,0.0071,0.2459,...,-0.0249,0.2655,182.0,0.4677,0.3000,0.3358,-0.2465,-0.5820,1.0,1
3,2022-06-01,20872,Damascus,MD,MONTGOMERY,39.285477,-77.215203,731995.0,0.0028,0.2642,...,-0.0137,0.1910,36.0,0.0909,-0.2000,1.2500,0.3676,-0.1944,1.0,1
4,2022-06-01,87121,Albuquerque,NM,BERNALILLO,35.059663,-106.805535,271000.0,0.0131,0.2047,...,0.0424,0.2202,159.0,-0.0305,-0.0185,3.5429,0.3634,-1.0089,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98152,2017-07-01,46060,Noblesville,IN,HAMILTON,40.073328,-85.999521,255975.0,-0.0623,0.0897,...,-0.0694,0.0532,260.0,0.0788,0.0442,0.8276,-0.1160,0.0562,0.0,0
98153,2017-07-01,21144,Severn,MD,ANNE ARUNDEL,38.983971,-76.623121,414990.0,-0.0493,-0.0236,...,-0.0260,-0.0125,151.0,0.0134,-0.2011,0.2258,0.0027,0.0233,0.0,0
98154,2017-07-01,89506,Reno,NV,WASHOE,39.695488,-119.811146,294900.0,0.0543,0.1430,...,0.0661,0.1502,204.0,-0.0192,-0.0852,2.4483,0.2968,1.2326,0.0,1
98155,2017-07-01,32034,Fernandina Beach,FL,NASSAU,30.607780,-81.682889,429990.0,0.0000,-0.0205,...,0.0021,0.0518,890.0,-0.0252,0.2276,0.5961,0.0229,0.4670,0.0,0


In [26]:
D = pd.get_dummies(binary_df, columns=['year_month'])
D

Unnamed: 0,postal_code,city,state_,county,latitude,longitude,median_listing_price,median_listing_price_mm,median_listing_price_yy,active_listing_count,...,year_month_2021-09-01,year_month_2021-10-01,year_month_2021-11-01,year_month_2021-12-01,year_month_2022-01-01,year_month_2022-02-01,year_month_2022-03-01,year_month_2022-04-01,year_month_2022-05-01,year_month_2022-06-01
0,33572,Apollo Beach,FL,HILLSBOROUGH,27.845790,-82.331983,731950.0,0.1665,0.5577,93.0,...,0,0,0,0,0,0,0,0,0,1
1,28445,Holly Ridge,NC,ONSLOW,34.527983,-77.514707,687450.0,0.1325,0.4761,77.0,...,0,0,0,0,0,0,0,0,0,1
2,78634,Hutto,TX,WILLIAMSON,30.551885,-97.554189,472500.0,0.0071,0.2459,134.0,...,0,0,0,0,0,0,0,0,0,1
3,20872,Damascus,MD,MONTGOMERY,39.285477,-77.215203,731995.0,0.0028,0.2642,16.0,...,0,0,0,0,0,0,0,0,0,1
4,87121,Albuquerque,NM,BERNALILLO,35.059663,-106.805535,271000.0,0.0131,0.2047,35.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98152,46060,Noblesville,IN,HAMILTON,40.073328,-85.999521,255975.0,-0.0623,0.0897,145.0,...,0,0,0,0,0,0,0,0,0,0
98153,21144,Severn,MD,ANNE ARUNDEL,38.983971,-76.623121,414990.0,-0.0493,-0.0236,124.0,...,0,0,0,0,0,0,0,0,0,0
98154,89506,Reno,NV,WASHOE,39.695488,-119.811146,294900.0,0.0543,0.1430,58.0,...,0,0,0,0,0,0,0,0,0,0
98155,32034,Fernandina Beach,FL,NASSAU,30.607780,-81.682889,429990.0,0.0000,-0.0205,557.0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Creating the scaler instance
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()

In [28]:
clean = D.drop(columns=['state_','city', 'county'])

In [29]:
# Fitting the scaler
loans_data_scaled = data_scaler.fit_transform(clean)
loans_data_scaled[:5]

array([[-0.83934121, -1.34106489,  0.74437314,  0.99748267,  2.87290515,
         3.05981242, -0.33714121,  2.48830747,  2.39093549, -1.14863959,
        -0.18958453, -0.73938958,  0.7753844 , -0.01133473,  0.26939862,
        -0.18352227,  1.03726466,  0.66671657,  0.31019194,  1.1623398 ,
         2.04081557,  1.17675138, -0.01318549, -0.08752903,  0.35809643,
         0.21236447,  1.64796035,  0.61030977, -0.16247747,  0.39956384,
         0.66972878,  0.07647441,  0.87988354,  0.21214806,  1.18549921,
         1.08355345,  0.86199607, -1.62640759, -1.80740883,  1.89773469,
         0.93110392, -0.17022007, -0.16216035, -0.1528186 , -0.15865214,
        -0.1359012 , -0.1372172 , -0.1359012 , -0.13211987, -0.14311163,
        -0.17078955, -0.16393828, -0.16673471, -0.1830992 , -0.16321603,
        -0.17204907, -0.16579644, -0.14731759, -0.14546611, -0.13362854,
        -0.12933859, -0.15007334, -0.15215404, -0.12491137, -0.12402511,
        -0.12637551, -0.11218509, -0.13852162, -0.1

In [30]:
print(np.mean(loans_data_scaled[:,0]))
print(np.std(loans_data_scaled[:,0]))

-5.791071330705707e-18
1.0


In [31]:
# Define features set of NC
X = clean.copy()
X = X.drop('increase_or_decrease', axis=1)
X.head()

Unnamed: 0,postal_code,latitude,longitude,median_listing_price,median_listing_price_mm,median_listing_price_yy,active_listing_count,active_listing_count_mm,active_listing_count_yy,median_days_on_market,...,year_month_2021-09-01,year_month_2021-10-01,year_month_2021-11-01,year_month_2021-12-01,year_month_2022-01-01,year_month_2022-02-01,year_month_2022-03-01,year_month_2022-04-01,year_month_2022-05-01,year_month_2022-06-01
0,33572,27.84579,-82.331983,731950.0,0.1665,0.5577,93.0,0.4091,0.9787,24.0,...,0,0,0,0,0,0,0,0,0,1
1,28445,34.527983,-77.514707,687450.0,0.1325,0.4761,77.0,0.375,0.2833,33.0,...,0,0,0,0,0,0,0,0,0,1
2,78634,30.551885,-97.554189,472500.0,0.0071,0.2459,134.0,0.6962,0.8356,16.0,...,0,0,0,0,0,0,0,0,0,1
3,20872,39.285477,-77.215203,731995.0,0.0028,0.2642,16.0,-0.0588,-0.1111,53.0,...,0,0,0,0,0,0,0,0,0,1
4,87121,35.059663,-106.805535,271000.0,0.0131,0.2047,35.0,-0.1026,0.2069,36.0,...,0,0,0,0,0,0,0,0,0,1


In [32]:
# Define target vector
y = clean['increase_or_decrease'].values.reshape(-1, 1)
y[:5]

array([[1],
       [1],
       [1],
       [1],
       [1]], dtype=int64)

In [33]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=560)

In [34]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(73617, 100)
(24540, 100)
(73617, 1)
(24540, 1)


In [35]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [36]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [37]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [38]:
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [39]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [40]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)
predictions

array([1, 1, 0, ..., 0, 1, 0], dtype=int64)

In [41]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [42]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,11336,0
Actual 1,0,13204


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11336
           1       1.00      1.00      1.00     13204

    accuracy                           1.00     24540
   macro avg       1.00      1.00      1.00     24540
weighted avg       1.00      1.00      1.00     24540



In [43]:
# Use the train_test_split function to create training and testing subsets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(73617, 100)

In [44]:
# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train_scaled.shape

(73617, 100)

In [45]:
# Instantiate a linear SVM model
from sklearn.svm import SVC
model = SVC(kernel='linear')

In [None]:
# Fit the data
model.fit(X_train, y_train)

In [None]:
# Make predictions using the test data
y_pred = model.predict(X_test)
results = pd.DataFrame({
    "Prediction": y_pred, 
    "Actual": y_test
}).reset_index(drop=True)
results.head()