In [41]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [30]:
# Load the files
training_data_path = 'training_data.csv'
training_targets_path = 'training_data_targets.csv'
test_data_path = 'test_data.csv'

# Reading the CSV files
training_data = pd.read_csv(training_data_path)
training_targets = pd.read_csv(training_targets_path)
test_data = pd.read_csv(test_data_path)

In [31]:
# Normalizing numerical columns
numerical_cols = ['SQUARE_FT', 'LONGITUDE', 'LATITUDE']
scaler = MinMaxScaler()

training_data_normalized = training_data.copy()
test_data_normalized = test_data.copy()

training_data_normalized[numerical_cols] = scaler.fit_transform(training_data[numerical_cols])
test_data_normalized[numerical_cols] = scaler.transform(test_data[numerical_cols])

In [32]:
# Feature Engineering and Frequency Encoding for 'ADDRESS'
training_data_fe = training_data_normalized.copy()
test_data_fe = test_data_normalized.copy()

training_data_fe['CITY'] = training_data_fe['ADDRESS'].apply(lambda x: x.split(',')[-1].strip())
test_data_fe['CITY'] = test_data_fe['ADDRESS'].apply(lambda x: x.split(',')[-1].strip())

city_frequency = training_data_fe['CITY'].append(test_data_fe['CITY']).value_counts()
training_data_fe['CITY'] = training_data_fe['CITY'].map(city_frequency)
test_data_fe['CITY'] = test_data_fe['CITY'].map(city_frequency)

training_data_fe.drop('ADDRESS', axis=1, inplace=True)
test_data_fe.drop('ADDRESS', axis=1, inplace=True)

  city_frequency = training_data_fe['CITY'].append(test_data_fe['CITY']).value_counts()


In [33]:
# Aligning the training data with the training targets
X_aligned = training_data_fe.iloc[:-1]
y = training_targets.squeeze()

In [34]:
# Splitting the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_aligned, y, test_size=0.2, random_state=42)

In [35]:
# Defining and training models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

results_aligned = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    mse = mean_squared_error(y_val, predictions)
    results_aligned[name] = np.sqrt(mse)  # Root Mean Squared Error (RMSE)

In [36]:
for results in results_aligned.items():
  print(results)

('Linear Regression', 571.533825735233)
('Decision Tree', 914.524388967419)
('Random Forest', 680.76504985028)
('Gradient Boosting', 582.3312278930188)


In [50]:
best_lr_regressor = LinearRegression()


In [51]:
test_data = pd.read_csv('test_data.csv')

In [52]:
test_data.head()

Unnamed: 0,UNDER_CONSTRUCTION,RERA,BHK_NO.,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE
0,0,0,3,1057.896332,1,1,"Nayabad,Kolkata",22.483471,88.417711
1,0,0,3,1340.588282,1,1,"Sector 42 Seawoods,Lalitpur",28.456809,77.099182
2,0,0,2,800.0,1,1,"Indirapuram,Ghaziabad",28.63676,77.36315
3,0,0,3,1800.327332,1,1,"Navratna Complex,Udaipur",24.58333,73.68333
4,1,0,2,903.024911,0,1,"Madhyamgram,Kolkata",22.7,88.45


In [53]:
encoder = OneHotEncoder()
test_data_encoded = encoder.fit_transform(test_data)

In [54]:
feature_names = encoder.get_feature_names_out(test_data.columns)
print("\nOne-Hot Encoded Feature Names: \n", feature_names)


One-Hot Encoded Feature Names: 
 ['UNDER_CONSTRUCTION_0' 'UNDER_CONSTRUCTION_1' 'RERA_0' ...
 'LATITUDE_106.784197' 'LATITUDE_132.764401' 'LATITUDE_136.0']


In [55]:
test_data_encoded_df = pd.DataFrame(test_data_encoded.toarray(), columns=feature_names)

print("\nOne-Hot Encoded Dataset:\n", test_data_encoded_df.head())


One-Hot Encoded Dataset:
    UNDER_CONSTRUCTION_0  UNDER_CONSTRUCTION_1  RERA_0  RERA_1  BHK_NO._1  \
0                   1.0                   0.0     1.0     0.0        0.0   
1                   1.0                   0.0     1.0     0.0        0.0   
2                   1.0                   0.0     1.0     0.0        0.0   
3                   1.0                   0.0     1.0     0.0        0.0   
4                   0.0                   1.0     1.0     0.0        0.0   

   BHK_NO._2  BHK_NO._3  BHK_NO._4  BHK_NO._5  BHK_NO._6  ...  \
0        0.0        1.0        0.0        0.0        0.0  ...   
1        0.0        1.0        0.0        0.0        0.0  ...   
2        1.0        0.0        0.0        0.0        0.0  ...   
3        0.0        1.0        0.0        0.0        0.0  ...   
4        1.0        0.0        0.0        0.0        0.0  ...   

   LATITUDE_91.748271  LATITUDE_91.754959  LATITUDE_91.76667  \
0                 0.0                 0.0                0.0 

In [56]:
test_predictions = best_lr_regressor.predict(X_train)

NotFittedError: This LinearRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [59]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Assuming you have your features X_train and target variable y_train
# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X_aligned, y, test_size=0.2, random_state=42)

# Create a Linear Regression model
lr_regressor = LinearRegression()

# Fit the model to the training data
lr_regressor.fit(X_train, y_train)

# Now the model is fitted, and you can make predictions
test_predictions = lr_regressor.predict(X_val)


In [60]:
predictions_df = pd.DataFrame({'predicted_label': test_predictions})
predictions_df.to_csv('predictions.txt', index = False)