In [1]:
# Core libraries used throughout the notebook
import os
import warnings

import numpy as np
import pandas as pd

# Silence library warnings for the whole session
warnings.filterwarnings("ignore")

# Print the full path of every file found under the Kaggle input directory
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/final-data/train_final.csv
/kaggle/input/final-data/test_final.csv


In [2]:
# Read the merged train and test CSV files from the Kaggle input directory
train_path = '/kaggle/input/final-data/train_final.csv'
test_path = '/kaggle/input/final-data/test_final.csv'
train_final_merged, test_final_merged = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

In [3]:
# Preview the first five rows of the merged test dataset
test_final_merged.head(5)

Unnamed: 0.1,Unnamed: 0,farm_id,ingredient_type,id,operations_commencing_year,num_processing_plants,farm_area,farming_company,deidentified_location,temp_obs,cloudiness,wind_direction,pressure_sea_level,precipitation,wind_speed,month,day,hour
0,0,fid_110884,ing_w,0,new,7,690.455096,Obery Farms,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0
1,1,fid_90053,ing_w,1,new,7,252.69616,Obery Farms,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0
2,2,fid_17537,ing_w,2,moderate,7,499.446528,Obery Farms,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0
3,3,fid_110392,ing_w,3,moderate,7,2200.407555,Obery Farms,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0
4,4,fid_62402,ing_w,4,moderate,7,10833.140121,Obery Farms,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0


In [4]:
# Inspect the column data types of the merged train dataset
train_final_merged.dtypes

Unnamed: 0                      int64
farm_id                        object
ingredient_type                object
yield                         float64
operations_commencing_year     object
num_processing_plants           int64
farm_area                     float64
farming_company                object
deidentified_location          object
temp_obs                      float64
cloudiness                    float64
wind_direction                float64
pressure_sea_level            float64
precipitation                 float64
wind_speed                    float64
month                           int64
day                             int64
hour                            int64
dtype: object

* `farm_id` is a unique identifier for each farm, much like a personal ID number: performance is judged on skills, not on the ID itself. The same applies here: the yield prediction depends on location and weather conditions, not on the identifier, so let's drop it.

In [5]:
# Drop the farm identifier and the 'Unnamed: 0' column (presumably a saved CSV index)
# from both the train and the test dataset.
# Reassigning with errors='ignore' keeps the cell re-runnable: the previous in-place drop
# raised KeyError on a second run because the columns were already gone.
id_like_cols = ['farm_id', 'Unnamed: 0']
train_final_merged = train_final_merged.drop(columns=id_like_cols, errors='ignore')
test_final_merged = test_final_merged.drop(columns=id_like_cols, errors='ignore')

In [6]:
# Names of the object (string) columns that will be treated as categorical
cat_cols = [
    'ingredient_type',
    'deidentified_location',
    'operations_commencing_year',
    'farming_company',
]
cat_cols

['ingredient_type',
 'deidentified_location',
 'operations_commencing_year',
 'farming_company']

In [7]:
# Cast the object columns listed in cat_cols to the pandas category dtype, in both datasets
for frame in (train_final_merged, test_final_merged):
    frame[cat_cols] = frame[cat_cols].astype("category")

In [8]:
# Frequency of each operations_commencing_year level in the train data
train_final_merged['operations_commencing_year'].value_counts()

new         13672001
old          4403341
moderate     2436190
Name: operations_commencing_year, dtype: int64

In [9]:
# Frequency of each operations_commencing_year level in the test data
test_final_merged['operations_commencing_year'].value_counts()

new         13929055
old          4662664
moderate     2563078
Name: operations_commencing_year, dtype: int64

In [10]:
# Number of rows per location in the train data
train_final_merged['deidentified_location'].value_counts()

location 2532    2790470
location 8421    2703006
location 5489    2580537
location 5410    2525792
location 5290    2439148
location 5150    1760872
location 7369    1085446
location 6364     819743
location 5833     799257
location 565      667989
location 5677     575401
location 959      552034
location 1784     411313
location 4525     359642
location 7048     323623
location 868      117259
Name: deidentified_location, dtype: int64

In [11]:
# Number of rows per location in the test data
test_final_merged['deidentified_location'].value_counts()

location 2532    2785362
location 8421    2703441
location 5489    2584200
location 5410    2548869
location 5290    2478797
location 5150    2147040
location 7369    1138800
location 5833     849623
location 6364     822030
location 565      700560
location 5677     621960
location 959      551439
location 1784     437150
location 4525     346332
location 7048     323750
location 868      115444
Name: deidentified_location, dtype: int64

# Model Building and Predictions

In [12]:
# Separate the target (yield) from the feature columns
from sklearn.model_selection import train_test_split

y = train_final_merged['yield']
X = train_final_merged.drop(columns='yield')

In [13]:
# Hold out 30% of the rows for validation; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=123,
)

In [14]:
# Show the shapes of the feature splits, then of the target splits
for train_part, holdout_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, holdout_part.shape)

(14358072, 15) (6153460, 15)
(14358072,) (6153460,)


<div class="alert alert-block alert-info"><b> 
  Converting categorical features to numerical: label encoding for location and company, then dummy variables (get_dummies) for the remaining categorical columns
</b></div> 

In [15]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the location and company columns.
# Each column gets its own encoder, fitted on the training split only and then applied to the
# hold-out split. The fitted encoders are kept in `label_encoders` so the exact training
# mapping stays available; a single shared encoder loses the location mapping as soon as it
# is refitted on the next column.
label_encoders = {}
for col in ('deidentified_location', 'farming_company'):
    encoder = LabelEncoder()
    X_train[col] = encoder.fit_transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])
    label_encoders[col] = encoder

# `label` stays bound to the most recently fitted encoder (farming_company), as before.
label = label_encoders['farming_company']

In [16]:
# One-hot encode the remaining categorical columns in both splits,
# dropping the first level of each to avoid a redundant column
dummy_cols = ['ingredient_type', 'operations_commencing_year']
X_train, X_test = (
    pd.get_dummies(part, columns=dummy_cols, drop_first=True)
    for part in (X_train, X_test)
)

In [17]:
# Min-max scale the numeric features
from sklearn.preprocessing import MinMaxScaler

num_cols = [
    "farm_area",
    "wind_speed",
    "precipitation",
    "temp_obs",
    "pressure_sea_level",
    "wind_direction",
    'num_processing_plants',
]

# Learn the column ranges from the training split only
scaler = MinMaxScaler().fit(X_train[num_cols])

# Apply the same training-derived ranges to both splits
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [18]:
# Preview the first five rows of the prepared training features
X_train.head(5)

Unnamed: 0,num_processing_plants,farm_area,farming_company,deidentified_location,temp_obs,cloudiness,wind_direction,pressure_sea_level,precipitation,wind_speed,month,day,hour,ingredient_type_ing_x,ingredient_type_ing_y,ingredient_type_ing_z,operations_commencing_year_new,operations_commencing_year_old
2022715,0.32,0.064231,1,15,0.461235,2.0,0.111111,0.746442,0.002907,0.189474,3,19,4,0,0,0,0,1
16368260,0.08,0.104,8,13,0.862024,0.0,0.527778,0.556274,0.002907,0.163158,8,6,23,1,0,0,1,0
1227778,0.04,0.053723,11,10,0.471748,2.0,0.027778,0.623545,0.002907,0.3,3,5,11,0,0,0,0,0
2440463,0.28,0.039066,14,15,0.441524,2.0,0.444444,0.661061,0.002907,0.189474,12,20,21,0,0,0,0,1
16311547,0.08,0.644207,11,13,0.722733,0.0,0.527778,0.614489,0.002907,0.078947,7,30,7,1,0,0,1,0


In [19]:
# Preview the first five rows of the prepared hold-out features
X_test.head(5)

Unnamed: 0,num_processing_plants,farm_area,farming_company,deidentified_location,temp_obs,cloudiness,wind_direction,pressure_sea_level,precipitation,wind_speed,month,day,hour,ingredient_type_ing_x,ingredient_type_ing_y,ingredient_type_ing_z,operations_commencing_year_new,operations_commencing_year_old
17027306,0.08,0.575444,11,13,0.65046,2.0,0.0,0.697283,0.002907,0.0,11,4,1,0,1,0,1,0
7082224,0.08,0.079748,7,1,0.642576,2.0,0.388889,0.600259,0.002907,0.431579,9,25,4,0,0,0,1,0
17343599,0.08,0.05122,1,13,0.568988,2.0,0.0,0.592497,0.002907,0.0,12,16,18,0,0,0,1,0
11637690,0.0,0.010988,11,8,0.540079,2.0,0.972222,0.692109,0.002907,0.431579,12,30,9,0,0,0,1,0
11350730,0.0,0.000271,11,8,0.751643,4.0,0.25,0.64295,0.002907,0.268421,6,21,23,0,0,0,1,0


In [20]:
# Confirm that every training feature column is now numeric
X_train.dtypes

num_processing_plants             float64
farm_area                         float64
farming_company                     int64
deidentified_location               int64
temp_obs                          float64
cloudiness                        float64
wind_direction                    float64
pressure_sea_level                float64
precipitation                     float64
wind_speed                        float64
month                               int64
day                                 int64
hour                                int64
ingredient_type_ing_x               uint8
ingredient_type_ing_y               uint8
ingredient_type_ing_z               uint8
operations_commencing_year_new      uint8
operations_commencing_year_old      uint8
dtype: object

# Model Building

<div class="alert alert-block alert-info"><b> 
  Linear Regression
</b></div>

In [21]:
# Fit an ordinary least squares model and report RMSE on both splits
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(X_train, y_train)

# Predictions on the data the model was fitted on and on the hold-out split
train_pred, test_pred = reg.predict(X_train), reg.predict(X_test)

# Despite the *_pred suffix these two names hold RMSE values, not predictions
lr_train_pred = np.sqrt(mean_squared_error(y_train, train_pred))
lr_test_pred = np.sqrt(mean_squared_error(y_test, test_pred))

print('RMSE of Linear Regressor on train : ', lr_train_pred)
print('RMSE of Linear Regressor on test : ', lr_test_pred)

RMSE of Linear Regressor on train :  152198.99657768806
RMSE of Linear Regressor on test :  151537.8485386369


<div class="alert alert-block alert-info"><b> 
   Only linear regression could be run here, so the predictions below are also made with the linear regression model
</b></div> 

## Let's work on the test data

In [22]:
# Row and column count of the merged test dataset before any trimming
test_final_merged.shape

(21154797, 16)

As the shape above shows, the test data has more rows than the submission file expects, so let's drop the surplus rows.

In [23]:
# Trim the test frame to the number of rows kept for the submission.
# presumably 20,848,800 is the submission file's row count; TODO confirm against the sample file
expected_rows = 20848800

# Number of surplus rows at the end of the frame (305997 on a fresh run)
n = len(test_final_merged) - expected_rows
assert n >= 0, "test data has fewer rows than the submission expects"

# Slice to an absolute length instead of `.iloc[:-n]`: re-running the cell can then never
# drop a further n rows, and n == 0 does not yield an empty frame.
# NOTE(review): rows are kept purely by position; confirm that the surplus rows really are
# the trailing ones (for example, that they are not duplicates introduced by the merge).
Test_Data = test_final_merged.iloc[:expected_rows]

# Preview the trimmed frame
Test_Data.head(10)

Unnamed: 0,ingredient_type,id,operations_commencing_year,num_processing_plants,farm_area,farming_company,deidentified_location,temp_obs,cloudiness,wind_direction,pressure_sea_level,precipitation,wind_speed,month,day,hour
0,ing_w,0,new,7,690.455096,Obery Farms,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0
1,ing_w,1,new,7,252.69616,Obery Farms,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0
2,ing_w,2,moderate,7,499.446528,Obery Farms,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0
3,ing_w,3,moderate,7,2200.407555,Obery Farms,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0
4,ing_w,4,moderate,7,10833.140121,Obery Farms,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0
5,ing_w,5,moderate,7,743.224,Obery Farms,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0
6,ing_w,6,moderate,7,2594.409178,Dole Food Company,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0
7,ing_w,7,moderate,7,11248.137822,Obery Farms,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0
8,ing_x,8,moderate,7,11248.137822,Obery Farms,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0
9,ing_w,9,moderate,7,5649.338527,Obery Farms,location 7369,17.8,4.0,100.0,1021.4,0.0,3.6,1,1,0


In [24]:
# One-hot encode the low-cardinality categorical columns of the test data.
dummy_cols = ['ingredient_type', 'operations_commencing_year']

# get_dummies emits one column per category level and drop_first removes the first level, so
# the test columns only line up with the training features when both frames share exactly
# the same levels in the same order. Fail loudly instead of predicting on shifted columns.
for col in dummy_cols:
    train_levels = list(train_final_merged[col].cat.categories)
    test_levels = list(Test_Data[col].cat.categories)
    if test_levels != train_levels:
        raise ValueError(
            f"category levels of '{col}' differ between train {train_levels} "
            f"and test {test_levels}"
        )

test_final_merged = pd.get_dummies(Test_Data, columns=dummy_cols, drop_first=True)

In [25]:
# Encode the test categoricals with the same string -> integer mapping the model was trained on.
# Refitting an encoder on the test frame yields a different mapping whenever the test data
# lacks a training category or contains a new one, which silently corrupts these features.
for col in ('deidentified_location', 'farming_company'):
    # Rebuild the training-split encoder from the untouched raw values in X
    # (X_train was sampled from X, so its index selects exactly the rows the model saw).
    train_values = list(X.loc[X_train.index, col].unique())
    train_encoder = LabelEncoder().fit(train_values)
    # transform() raises ValueError on a label that was not present in training
    test_final_merged[col] = train_encoder.transform(test_final_merged[col])

In [26]:
# Scale the numeric test columns with the scaler fitted on the training split
scaled_test_values = scaler.transform(test_final_merged[num_cols])
test_final_merged[num_cols] = scaled_test_values

In [27]:
# Keep every column except the row identifier as model input
is_feature = test_final_merged.columns != 'id'
fin = test_final_merged.loc[:, is_feature]

In [28]:
# Check the dimensions of the final test feature matrix
fin.shape

(20848800, 18)

# Predictions

In [29]:
# Predict yield for the test data with the fitted linear regression
# (the only model trained in this notebook).
# The test features must have the same names in the same order as the training features;
# otherwise the coefficients would be applied to the wrong columns without any error.
assert list(fin.columns) == list(X_train.columns), (
    "test feature columns do not match the columns the model was fitted on"
)
test_pred = reg.predict(fin)

In [30]:
# Pair each test id with its predicted yield
sub = test_final_merged[['id']].assign(**{'yield': test_pred})

In [31]:
# Check the dimensions of the submission dataframe
sub.shape

(20848800, 2)

In [32]:
# Preview the first five rows of the submission dataframe
sub.head(5)

Unnamed: 0,id,yield
0,0,-1639.123589
1,1,-1774.214708
2,2,-2633.435929
3,3,-2108.524362
4,4,555.511503


In [33]:
# Preview the last five rows of the submission dataframe
sub.tail(5)

Unnamed: 0,id,yield
20848795,25259065,2992.577367
20848796,25259066,2391.74415
20848797,25259067,2648.680699
20848798,25259068,3096.647988
20848799,25259069,2241.672629


In [34]:
# Write the submission file without the dataframe index.
# index=False is the documented way to omit it; index=None only worked because None is falsy.
sub.to_csv('submissionwhole.csv', index=False)

In [35]:
# Count how many predicted yields are below zero
num_negative_yield = sub['yield'].lt(0).sum()
num_negative_yield

9655646