In [None]:
# importing modules and packages 
import datetime
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler as Scaler
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
# NOTE(review): `seed` is reassigned to 42 in a later cell before its first use,
# so this value never takes effect.
seed = 101
# import all libraries needed

In [None]:
# Load the raw price records. Later cells read the columns price_date, parish,
# commodity, type, Category, supply, grade and Actual_Price from this frame.
df = pd.read_csv("data/actual.csv")
df.head()

In [None]:
# Keep only rows priced at 50 or more. The explicit .copy() gives later cells an
# independent frame to assign columns on, rather than a slice of the loaded data
# (assigning to such a slice triggers pandas' SettingWithCopyWarning).
# NOTE(review): the 50 cut-off is undocumented — presumably it removes bad or
# placeholder prices; confirm.
df = df[df["Actual_Price"] >= 50].copy()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Parse the month/day/year strings in price_date into real timestamps.
parsed_dates = pd.to_datetime(df["price_date"], format="%m/%d/%Y")
df["price_date"] = parsed_dates

In [None]:
#df = df.sort_values('price_date', ascending=False)

In [None]:
# Render the parsed dates as ISO "YYYY-MM-DD" text. ISO strings sort in
# chronological order, which matters because price_date is label-encoded later.
iso_dates = df["price_date"].dt.strftime('%Y-%m-%d')
df["price_date"] = iso_dates
df.head()

In [None]:
# Snapshot of the frame before label encoding and scaling; the model's
# predictions are attached to this copy further down.
df0 = df.copy()

In [None]:
# Converting string columns to numerical codes.
# One LabelEncoder is kept per column (in `label_encoders`) so that every
# encoding can be inverted later; re-fitting a single shared encoder would
# retain only the classes of the last column it saw.
categorical_columns = ["price_date", "parish", "commodity", "type", "Category", "supply", "grade"]
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = preprocessing.LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Backward compatibility: `le` previously ended up fitted on the last column ("grade").
le = label_encoders["grade"]

In [None]:
# apply normalization techniques by Column 1 
#column = 'Actual_Price'
#df[column] = (df[column] - df[column].min()) / (df[column].max() - df[column].min())  

In [None]:
# Write the label-encoded, not-yet-scaled frame to disk for inspection.
df.to_csv("before.csv")

In [None]:
# Min-max scale every column (the features and the Actual_Price target) to [0, 1].
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test rows influence the scaling — consider fitting on the
# training rows only.
x = df.values  # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Reuse the frame's own column labels instead of a hard-coded list, so the names
# cannot drift out of order relative to the data they describe.
df_scaled = pd.DataFrame(x_scaled, columns=df.columns)
df = df_scaled.copy()

In [None]:
#normalized_df=(df-df.min())/(df.max()-df.min())
#df = normalized_df.copy()

In [None]:
# Preview the first rows of the scaled frame.
df.head()

In [None]:
#df = normalized_df.copy()

In [None]:
#using One Hot encoder to convert string columns into numerical columns
#df_encoded = pd.get_dummies(df, columns=['parish', 'commodity', 'type', 'Category', 'supply', 'grade'])
#df_encoded.head()

In [None]:
#df = df_encoded.copy()

In [None]:
# Build a {column: np.int32} mapping for every int64 column.
# NOTE(review): once the MinMaxScaler cell above has run, all columns are floats,
# so this mapping is expected to be empty.
d = {column: np.int32 for column in df.select_dtypes(np.int64).columns}

In [None]:
# Apply the dtype mapping `d`; columns not named in it keep their dtype.
df = df.astype(d)

In [None]:
# Make sure the target column is stored as float, then summarise the frame.
df['Actual_Price'] = df['Actual_Price'].astype(float)
df.info()

In [None]:
# Write the scaled frame to disk for inspection.
df.to_csv('test.csv')

In [None]:
# Pairwise correlation between all columns, including the target.
corr = df.corr()
corr

In [None]:
#sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)

In [None]:
# One 50-bin histogram per column, to eyeball each distribution.
df.hist(bins=50, figsize=(25,10))
plt.show()

In [None]:
# List the column labels of the modelling frame.
df.columns

In [None]:
# Separate the feature matrix from the target vector, both as NumPy arrays.
feature_frame = df.drop(columns=['Actual_Price'])
X = feature_frame.to_numpy(copy=True)
y = df['Actual_Price'].to_numpy(copy=True)

In [None]:
# Experiment configuration: random seed for the train/test split and the metric name.
# This assignment replaces the `seed = 101` set in the imports cell.
seed = 42
scoring = 'r2'

In [None]:
# Hold out 30% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=seed)

In [None]:
#scaling x
#mm_scaler = Scaler()
#mm_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
#mm_scaler.fit(X_train)
#X_train = mm_scaler.transform(X_train)
#X_test = mm_scaler.transform(X_test)

In [None]:
# Spot Check Algorithms
# Actual_Price is a continuous target, so only regressors are compared here.
# The classifiers listed previously (LinearDiscriminantAnalysis,
# KNeighborsClassifier, DecisionTreeClassifier, SVC) reject continuous labels
# and cannot be scored with r2, so they are replaced by their regression
# counterparts; LDA has no regression counterpart and is dropped.
models = [
    ('LNR', LinearRegression()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor(random_state=seed)),
    ('RGR', RandomForestRegressor(random_state=seed)),
    ('SVM', SVR()),
]

In [None]:
# evaluate each model in turn
# The fold splitter does not depend on the model, so it is built once.
kfold = model_selection.KFold(n_splits=10, random_state=None)
results = []
names = []
for name, model in models:
    # Score with the metric chosen in the configuration cell (`scoring`).
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report the mean and spread of the per-fold scores.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Tuned parameters for the random forest regressor.
# The forest is seeded so the selected parameters are reproducible, and the
# 180 candidates x 10 folds are spread over all CPU cores (n_jobs=-1).
tuned_parameters = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100, 200],
    'n_estimators': [10, 25, 30, 50, 100, 200],
}
rgr_model = RandomForestRegressor(random_state=seed)
gridSearch = GridSearchCV(rgr_model, tuned_parameters, cv=10, scoring=scoring, n_jobs=-1)
gridSearch.fit(X_train, y_train)

In [None]:
"""
from sklearn.model_selection import GridSearchCV
tuned_parameters = {
    'solver' : ['svd', 'lsqr', 'eigen'],
    'shrinkage': ['auto', 0.1, 0.5],        
    'tol' : [0.1, 0.3, 0.5]   
}
lda_model = LinearDiscriminantAnalysis()
gridSearch = GridSearchCV(lda_model, tuned_parameters, cv=10, scoring='r2')
gridSearch.fit(X_train, y_train)"""

In [None]:
# Best mean cross-validated score found by the grid search.
gridSearch.best_score_

In [None]:
# Parameter combination that achieved the best score.
gridSearch.best_params_

In [None]:
# Take the estimator configured with the best parameters found by the search.
rgr_model = gridSearch.best_estimator_
print(rgr_model)

In [None]:
# The next cell reads `oob_score_`, which only exists on a forest fitted with
# oob_score=True, so enable it before this refit.
rgr_model.set_params(oob_score=True).fit(X_train, y_train)

In [None]:
# Out-of-bag score of the fitted forest; scikit-learn only sets this attribute
# when the forest was fitted with oob_score=True.
rgr_model.oob_score_

In [None]:
# making predictions
# Predict on the held-out test features.
predictions = rgr_model.predict(X_test)

In [None]:
# model evaluation (errors are expressed in the min-max-scaled target units)
for label, metric in (
    ('mean_squared_error : ', mean_squared_error),
    ('mean_absolute_error : ', mean_absolute_error),
):
    print(label, metric(y_test, predictions))

In [None]:
# Predict for every row (training and test rows alike) so the results can be
# attached to the full table.
total_predictions = rgr_model.predict(X)
total_predictions

In [None]:
# The model was trained on the min-max-scaled target, so its outputs are in
# scaled units. Map them back to price units before storing them next to the
# original Actual_Price values in df0.
price_col = df.columns.get_loc("Actual_Price")
df0["Predictions"] = (
    total_predictions * min_max_scaler.data_range_[price_col]
    + min_max_scaler.data_min_[price_col]
)
df0.head()

In [None]:
#denormalizing predictions
#column = "Predictions"
#df0["Predictions"] = (df0[column] * (df0[column].max() - df0[column].min()) + df0[column].min())
#df0.head()

In [None]:
# Write the original records together with the Predictions column to disk.
df0.to_csv("predictions.csv")

### Linear Discriminant Analysis


In [None]:
# Tuned parameters for a second random forest search.
# The grid must only name RandomForestRegressor parameters: the former 'solver'
# entry belongs to LinearDiscriminantAnalysis and made GridSearchCV fail with an
# invalid-parameter error.
tuned_parameters = {
    'min_samples_split': [2, 4],
    'max_features': [None],
}
# Seeded so the search result is reproducible.
rgr_model = RandomForestRegressor(random_state=seed)
gridSearch = GridSearchCV(rgr_model, tuned_parameters, cv=10, scoring='r2')
gridSearch.fit(X_train, y_train)

In [None]:
# making predictions
# `rgr` is not defined anywhere in the notebook; predict with the estimator
# selected by the grid search instead.
predictions = gridSearch.best_estimator_.predict(X_test)

In [None]:
# model evaluation (errors are expressed in the min-max-scaled target units)
for label, metric in (
    ('mean_squared_error : ', mean_squared_error),
    ('mean_absolute_error : ', mean_absolute_error),
):
    print(label, metric(y_test, predictions))

In [None]:
#model = LinearRegression()
# Seed the forest so the fitted (and later pickled) model is reproducible.
model = RandomForestRegressor(random_state=seed)

In [None]:
# fitting the model
# X_train_scaled is never defined (the per-split scaling cell is commented out);
# the features in X_train are already min-max scaled, so train on them directly.
model.fit(X_train, y_train)

In [None]:
# making predictions
# X_test_scaled is never defined (the per-split scaling cell is commented out);
# the features in X_test are already min-max scaled, so predict on them directly.
predictions = model.predict(X_test)

In [None]:
# model evaluation (errors are expressed in the min-max-scaled target units)
for label, metric in (
    ('mean_squared_error : ', mean_squared_error),
    ('mean_absolute_error : ', mean_absolute_error),
):
    print(label, metric(y_test, predictions))

In [None]:
# Fit a fresh min-max scaler on the full feature matrix and apply it.
# NOTE(review): X was already min-max scaled earlier in the notebook, so this
# second pass is expected to leave the values essentially unchanged.
mm_scaler = Scaler().fit(X)
X_full = mm_scaler.transform(X)

In [None]:
# Predict for every row using the re-scaled full feature matrix.
total_predictions = model.predict(X_full)

In [None]:
# The model was trained on the min-max-scaled target, so its outputs are in
# scaled units. Map them back to price units before storing them next to the
# original Actual_Price values in df0.
price_col = df.columns.get_loc("Actual_Price")
df0["Predictions"] = (
    total_predictions * min_max_scaler.data_range_[price_col]
    + min_max_scaler.data_min_[price_col]
)
df0.head()

In [None]:
filename = "farmgate_model.pickle"

# save model — the context manager closes the file even if pickling fails
with open(filename, "wb") as model_file:
    pickle.dump(model, model_file)

# load model (only unpickle files you trust: pickle can execute arbitrary code)
with open(filename, "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
#filtering for unique rows without dates

In [None]:
# Reload the raw data and keep only the descriptive columns.
# NOTE(review): the first load reads "data/actual.csv" while this one reads
# "actual.csv" — confirm both paths are intended.
# NOTE(review): this rebinds `df`, replacing the scaled frame used above.
df = pd.read_csv("actual.csv").drop(columns=["price_date", "Actual_Price"])
df.head()

In [None]:
# Distinct combinations of the remaining columns; show (rows, columns).
dff = df.drop_duplicates()
dff.shape

In [None]:
#creating a list of dates for next three years
# Starting point: the current local date and time (naive, no timezone).
base = datetime.datetime.now()
#date_list = [base + datetime.timedelta(days=x) for x in range(numdays)]

In [None]:
# Display the reference timestamp.
base
