In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression



In [2]:
df = pd.DataFrame(np.random.randint(0,100,size=(100, 9)), columns=["Price","Bedroom","Space","Room","Lot","Tax","Bathroom","Garage","Condition"])

In [4]:
df.to_csv('data/realest.csv')

In [3]:
df

Unnamed: 0,Price,Bedroom,Space,Room,Lot,Tax,Bathroom,Garage,Condition
0,53,40,90,41,19,37,23,81,88
1,53,62,53,73,2,43,40,48,81
2,55,36,19,53,72,97,56,54,49
3,15,7,2,69,25,11,34,52,42
4,88,71,17,33,65,25,39,79,60
5,72,36,22,82,26,28,25,92,50
6,94,61,58,60,62,30,35,32,84
7,1,98,17,94,83,89,75,25,51
8,93,64,57,52,4,87,58,36,44
9,66,82,66,37,55,15,8,39,28


In [46]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression


class AnalysisDataAndFitLinearRegression:

    def __init__(self):
        self.version = 1

    def analyse_and_fit_lrm(self, path):
        # a path to a dataset is "./data/realest.csv"
        # dataset can be loaded by uncommenting the line bellow
        data = pd.read_csv(path)
        data = self.__listwise_deletion(data)
        summary_dict = self.analyze(data)
        regression_dict = self.fit(data)
        
        return {"summary_dict": summary_dict,
                "regression_dict": regression_dict}

    def analyze(self, data):
        return {"statistics": self.calc_statistics(data),
                "data_frame": self.filter_data(data),
                "number_of_observations": self.calc_noo(data)}

    def fit(self, data):
        cols = list(df.columns)
        dependant = "Price"
        cols.remove(dependant)
        X = df[cols]
        y = df[dependant]
        reg = LinearRegression().fit(X, y)
        model_parameters = dict(zip(["Intercept"]+cols,[reg.intercept_]+reg.coef_))
        price_prediction = reg.predict(np.array([3,1500,8,40,1000,2,1,0]).reshape(1, -1))[0]
        return {"model_parameters": model_parameters,
               "price_prediction": price_prediction}
    
    def calc_statistics(self, data):
        tax = data["Tax"]
        return [tax.mean(), tax.std(), tax.median(), tax.min(), tax.max()]

    def filter_data(self, data):
        return df[df["Space"]>800].sort_values("Price")
    
    def calc_noo(self, data):
        return df[df["Lot"]>=df["Lot"].quantile(.8)].shape[0]
        

    def __listwise_deletion(self, data: pd.DataFrame):
        return data.dropna()


    

In [47]:
adaflr = AnalysisDataAndFitLinearRegression()

In [48]:
adaflr.analyse_and_fit_lrm("data/realest.csv")

{'summary_dict': {'statistics': [51.9, 27.830112315891792, 52.5, 0, 99],
  'data_frame': Empty DataFrame
  Columns: [Price, Bedroom, Space, Room, Lot, Tax, Bathroom, Garage, Condition]
  Index: [],
  'number_of_observations': 21},
 'regression_dict': {'model_parameters': {'Intercept': 46.73581354774256,
   'Bedroom': 46.69971657870504,
   'Space': 46.90900245729413,
   'Room': 46.612538276151476,
   'Lot': 46.85986642904333,
   'Tax': 46.77448879790597,
   'Bathroom': 46.52222248203744,
   'Garage': 46.86521772503418},
  'price_prediction': 106.6480148889227}}

In [11]:
# df[df["Space"]>80].sort_values("Price")

In [15]:
df[df["Lot"]>=df["Lot"].quantile(.8)].shape[0]

21

In [25]:
cols = list(df.columns)

In [26]:
dependant = "Price"

In [27]:
cols.remove(dependant)

In [28]:
cols

['Bedroom', 'Space', 'Room', 'Lot', 'Tax', 'Bathroom', 'Garage', 'Condition']

In [30]:
X = df[cols]

In [31]:
y = df[dependant]

In [33]:
reg = LinearRegression().fit(X, y)

In [34]:
reg.intercept_

46.73828651532999

In [55]:
len(cols)

8

In [54]:
len(reg.coef_)

8

In [56]:
keys = ["Intercept"]+cols

In [57]:
keys

['Intercept',
 'Bedroom',
 'Space',
 'Room',
 'Lot',
 'Tax',
 'Bathroom',
 'Garage',
 'Condition']

In [64]:
vals = [reg.intercept_]+list(reg.coef_)

In [65]:
dict(zip(keys,vals))

{'Intercept': 46.73828651532999,
 'Bedroom': -0.002472967587432514,
 'Space': -0.03856993662494396,
 'Room': 0.1707159419641411,
 'Lot': -0.12574823917851022,
 'Tax': 0.12157991371333883,
 'Bathroom': 0.03620228257597558,
 'Garage': -0.21606403329254645,
 'Condition': 0.12693120970418972}

In [63]:
len(reg.coef_)

8

In [61]:
len(vals)

8

In [40]:
reg.predict(np.array([3,1500,8,40,1000,2,1,0]).reshape(1, -1))

array([106.64801489])

In [51]:
df[(df["Bathroom"]==2) & (df["Bedroom"]==4)]

Unnamed: 0,Price,Bedroom,Space,Room,Lot,Tax,Bathroom,Garage,Condition
