In [45]:
import pandas as pd

In [46]:
housing = pd.read_csv("data2.csv")  # retrieving the data: load data2.csv into a DataFrame

In [47]:
housing.head() 

In [4]:
# housing.columns

In [5]:
housing.describe()  # displaying summary statistics such as count, minimum, maximum, etc.

In [6]:
housing.info() # to make sure that we have all the data and none of it is missing

In [7]:
housing['LSTAT'].value_counts() # counting different attribute values

In [8]:
%matplotlib inline 
# to plot graphs in this notebook itself

In [9]:
# Histograms of the columns, to get a feel for each distribution
import matplotlib.pyplot as plt  # conventional alias for pyplot
housing.hist(bins=50, figsize=(20,15), color='green')

In [10]:
# spliting
# import numpy as np
# np.random.seed(42)
# def split_train_test(data, test_ratio):
#     shuffle = np.random.permutation(len(data))
#     print(shuffle)
#     test_set_size = int(len(data) * test_ratio)
#     test_indices = shuffle[:test_set_size]
#     train_indices = shuffle[test_set_size:]
#     return data.iloc[train_indices], data.iloc[test_indices]

In [11]:
# train_set, test_set = split_train_test(housing, 0.2)

In [12]:
# print(f"Rows in train set : {len(train_set)}")
# print(f"Rows in test set : {len(test_set)}")

In [13]:
# Random 80/20 split of the whole dataset into training and testing sets (fixed seed)

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(
    housing,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)
print(f"Rows in train set : {len(train_set)}")
print(f"Rows in test set : {len(test_set)}")

In [14]:
# Stratified 80/20 split: rows are sampled so that the CHAS values are
# represented in the same proportions in the training and testing sets

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields arrays of positional row numbers, so rows are selected with
# .iloc; .loc would perform a label lookup, which only gives the same rows
# while the DataFrame still has its default 0..n-1 index.
# n_splits=1, so a single (train, test) pair is produced.
train_index, test_index = next(split.split(housing, housing['CHAS']))
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [15]:
strat_train_set['CHAS'].value_counts()

In [16]:
strat_test_set['CHAS'].value_counts()

In [17]:
# From here on, explore a copy of the stratified training set so that columns
# added during exploration do not modify strat_train_set itself
housing = strat_train_set.copy()

In [18]:
# Plotting a scatter matrix to look for relationships between selected attributes

from pandas.plotting import scatter_matrix
attributes = ["MEDV", "ZN", "RM", "LSTAT"]
scatter_matrix(
    housing[attributes],
    alpha=0.8,
    figsize=(12, 8),
)

In [19]:
housing.plot(kind="scatter", x="RM", y="MEDV")

In [20]:
# Adding new attribute TAXRM = TAX divided by RM.
# This column is only used for exploration: `housing` is reassigned from
# strat_train_set further down, so TAXRM is not among the features the model is fit on.

housing['TAXRM'] = housing['TAX']/housing['RM']

In [21]:
housing['TAXRM'].info()

In [22]:
housing.head()

In [23]:
housing.describe()

In [24]:
corr_matrix = housing.corr() # finding the correlation between different attributes

In [25]:
corr_matrix

In [26]:
corr_matrix['MEDV'].sort_values(ascending=False) # finding the correlation with respect to MEDV

In [27]:
housing.plot(kind="scatter", x="TAXRM", y="MEDV", alpha=0.8) 

In [28]:
housing = strat_train_set.drop("MEDV", axis=1)

In [29]:
# Target values (MEDV) of the training set. The name must be `housing_labels`:
# that is the name the model-fitting, evaluation and cross-validation cells use.
housing_labels = strat_train_set["MEDV"].copy()

In [30]:
housing

In [31]:
# Options for handling missing values in a column (RM is used as the example)
# 1. dropping the rows in which RM is missing; the result goes into `a`, `housing` is not reassigned

a = housing.dropna(subset=["RM"]) 
a.shape
# a.describe()

In [32]:
# 2. dropping the entire column 

housing.drop("RM", axis=1).shape

In [33]:
# 3. filling the missing data points with some value (e.g. 0, the median or the mean)
# Note: the filled Series is only displayed here, not assigned back, so `housing` keeps its missing values

median = housing["RM"].median() 
housing["RM"].fillna(median) 

In [34]:
housing.shape

In [35]:
# before we start filling missing attributes

housing.describe() 

In [36]:
# filling the missing values with median

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)

In [37]:
imputer.statistics_.shape

In [38]:
X = imputer.transform(housing)

In [39]:
housing_tr = pd.DataFrame(X, columns=housing.columns)

In [40]:
housing_tr.describe()

In [41]:
# Preprocessing pipeline: median imputation followed by standard scaling

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

In [42]:
housing_num_tr = my_pipeline.fit_transform(housing)

In [43]:
housing_num_tr.shape

In [44]:
# Trying different models :
# 1. Linear Regression
# 2. Decision Tree Regression
# 3. Random Forest Regression

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# model = LinearRegression()
# model = DecisionTreeRegressor()
# Seeded like the data splits (random_state=42) so that re-running the
# notebook builds the same forest and reports the same scores.
model = RandomForestRegressor(random_state=42)
# Fit on the pipeline output (imputed + scaled training features) against the MEDV targets
model.fit(housing_num_tr, housing_labels)

In [None]:
some_data = housing.iloc[:5]

In [None]:
some_labels = housing_labels.iloc[:5]

In [None]:
prepared_data = my_pipeline.transform(some_data)

In [None]:
model.predict(prepared_data)

In [None]:
list(some_labels)

In [None]:
# Evaluating the model: RMSE on the same training data the model was fit on

import numpy as np  # required for np.sqrt; not imported by any earlier active cell
from sklearn.metrics import mean_squared_error
housing_predictions = model.predict(housing_num_tr)
mse = mean_squared_error(housing_labels, housing_predictions)
rmse = np.sqrt(mse)

In [None]:
rmse

In [None]:
# Cross Validation: 10 folds, scored with negated mean squared error
# NOTE(review): housing_num_tr was imputed and scaled on the full training set
# before being split into folds, so each validation fold has influenced the
# preprocessing statistics. Passing a single pipeline (preprocessing + model)
# to cross_val_score would avoid that; confirm whether it matters here.

from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
# scores are negated MSE values, so flip the sign before taking the square root
rmse_scores = np.sqrt(-scores)

In [None]:
rmse_scores

In [None]:
def print_scores(scores):
    """Print the given scores, then their mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` methods (for example a
    NumPy array). Nothing is returned; the three lines go to stdout.
    """
    print("Scores : ", scores)
    # The statistics are computed lazily, one per line, in the printed order.
    for caption, method_name in (("Mean : ", "mean"), ("Standard Deviation : ", "std")):
        print(caption, getattr(scores, method_name)())

In [None]:
print_scores(rmse_scores)

In [None]:
# saving the model

from joblib import dump,load
dump(model, 'Housing.joblib')

In [None]:
# Testing: evaluate the trained model on the held-out stratified test set

# Separate the features from the MEDV target, as was done for the training set
X_test = strat_test_set.drop("MEDV", axis=1)
Y_test = strat_test_set["MEDV"].copy()
# transform only (no fit): the pipeline fitted on the training set is applied as-is
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
# print(final_predictions,list(Y_test))

In [None]:
final_rmse

In [None]:
prepared_data[0]

In [None]:
# Using the model

from joblib import dump, load
import numpy as np
# Load the file written by the "saving the model" cell, which calls
# dump(model, 'Housing.joblib'); no cell in this notebook writes any other file.
model = load('Housing.joblib')
# One sample of 13 feature values.
# NOTE(review): these look like already-scaled values (presumably a row of the
# pipeline output such as prepared_data[0]) - confirm. Raw feature values would
# have to go through my_pipeline.transform before being passed to predict.
features = np.array([-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -0.24322309, -1.31238772,  2.61111401, -1.0016859 , -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034])
# predict expects a 2-D input, hence the single sample wrapped in a list
model.predict([features])