# Mission to Mars


## Importing Libraries

In [None]:
# REQUIRED LIBRARIES
# DO NOT TOUCH THE CODE IN THIS CELL
import pandas as pd
import numpy as np
import warnings
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# Validation
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer, LabelEncoder
from sklearn.naive_bayes import GaussianNB


# Tuning
from sklearn.model_selection import GridSearchCV


# Feature Extraction
from sklearn.feature_selection import RFE

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

warnings.filterwarnings('ignore')

In [None]:
# HELPER FUNCTIONS
# DO NOT TOUCH THE CODE IN THIS CELL
# Analyze Data
def explore_data(df):
    """Print a quick overview of a DataFrame: row count, column names, dtypes.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    print ("Our dataset contains:", "{:,}".format(df.shape[0]), "examples of recorded crops")
    print('\n')
    print('Dataset columns:',df.columns)
    print('\n')
    # df.info() writes its report itself and returns None. Passing it to
    # print() made the report appear *before* the label and appended a stray
    # "None", so the label is printed first and info() is called on its own.
    print('Data types of each columns: ')
    df.info()

def draw_barplot(data, variable, orient='v'):
    """Draw a count plot of one column, bars ordered from most to least frequent.

    Args:
        data: DataFrame holding the column to plot.
        variable: Name of the column whose values are counted.
        orient: 'v' (default) draws vertical bars; any other value draws
            horizontal bars and is forwarded to seaborn as ``orient``.
    """
    # Very large canvas; the font sizes below are scaled to match it.
    plt.figure(figsize=(150, 80))
    order = data[variable].value_counts().index
    if orient == 'v':
        # Pass the column by keyword: newer seaborn releases treat the first
        # positional argument as `data`, not `x`.
        ax = sns.countplot(x=variable, data=data, order=order)
        ax.set_xlabel(variable, fontsize=50)
        ax.set_ylabel("#", fontsize=50)
    else:
        ax = sns.countplot(y=variable, data=data, orient=orient, order=order)
        # Horizontal bars: categories run along y and counts along x, so the
        # axis labels are swapped relative to the vertical case.
        ax.set_xlabel("#", fontsize=50)
        ax.set_ylabel(variable, fontsize=50)
    ax.tick_params(labelsize=76)

def calculate_average(field, label, df):
    """Print the mean of ``field`` for every distinct value of ``label``.

    Args:
        field: Name of the numeric column to average.
        label: Name of the column to group rows by.
        df: pandas DataFrame containing both columns.

    Returns:
        None. The resulting Series is printed to standard output.
    """
    per_group = df.groupby(label)
    means = per_group[field].mean()
    print(means)
    

# Split training and validation set
def read_in_and_split_data(data, target):
    """Separate features from the target column and split into train/test sets.

    Args:
        data: DataFrame containing the feature columns and the target column.
        target: Name of the column to predict.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``; 20% of the rows go to
        the test portion and the split uses a fixed random seed of 0.
    """
    features = data.drop(columns=target)
    labels = data[target]
    # train_test_split returns a list; convert to keep the tuple return type.
    return tuple(train_test_split(features, labels, test_size=0.2, random_state=0))
    
# Spot-Check Algorithms
def GetModel():
    """Return the ``(name, estimator)`` pairs to spot-check.

    Returns:
        A list with a single entry: an SVC with probability estimates
        enabled, labelled 'SVM'.
    """
    return [('SVM', SVC(probability=True))]

# Train model
def fit_model(X_train, y_train,models):
    """Cross-validate each model and print its mean accuracy and spread.

    Args:
        X_train: Training features.
        y_train: Training labels.
        models: Iterable of ``(name, estimator)`` pairs.

    Returns:
        Tuple ``(names, results)``: the model names and, in the same order,
        the array of per-fold accuracy scores for each model.
    """
    n_splits = 10
    metric = 'accuracy'

    names, results = [], []
    for label, estimator in models:
        # Shuffled 10-fold split with a fixed seed so every run sees the same folds.
        splitter = KFold(n_splits=n_splits, shuffle=True, random_state=0)
        scores = cross_val_score(estimator, X_train, y_train, cv=splitter, scoring=metric)
        names.append(label)
        results.append(scores)
        print("%s: %f (%f)" % (label, scores.mean(), scores.std()))
    return names, results

# Save trained model
def save_model(model,filename):
    """Serialise ``model`` to ``filename`` with pickle.

    Args:
        model: Any picklable object (here, a trained estimator).
        filename: Path of the file to write; an existing file is overwritten.
    """
    # The context manager closes the handle even if pickling fails; the
    # previous bare open() never closed the file.
    with open(filename, 'wb') as fh:
        pickle.dump(model, fh)
    

# [Data Collection](http://)
In this workshop, we will skip the data collection step from the data science process, and use a readily available dataset: The Crop Recommendation Dataset.

This dataset would allow its users to build a smart model to recommend the most suitable crops to grow in a particular farm based on various parameters.

The dataset consists of 8 columns labelled as:

- **N** - ratio of Nitrogen content in soil
- **P** - ratio of Phosphorus content in soil
- **K** - ratio of Potassium content in soil
- **temperature** - temperature in degree Celsius
- **humidity** - relative humidity in %
- **ph** - pH value of the soil
- **rainfall** - rainfall in mm
- **Label** - the crop type planted


### Run the cell below to load and read the dataset

In [None]:
# Download the crop recommendation CSV from the workshop's GitHub repository
# and load it into a DataFrame (this needs network access).
data = pd.read_csv('https://raw.githubusercontent.com/ladyhodhod/mission_to_mars_crop_recommendation_system/main/data/Crop_recommendation.csv')

### In order to display the content of this dataset, we can use the code in the cell below. When you run it, you will see a random sample of recorded cases.

In [None]:
# Show 5 randomly chosen rows; no random_state is set, so the rows differ on each run.
data.sample(5)

### **STOP!**
### Take a few minutes to read the content of the table above. Getting to know the data is a crucial step in data science.

# [Data Exploration](http://)

Let's first rename the columns to have easier variable names.

Run the cell below and check what happens.

In [None]:
# Rename every column by position. This assumes the CSV has exactly these 8
# columns in this order (N, P, K, temperature, humidity, ph, rainfall, label)
# — TODO confirm against the file if the source data ever changes.
data.columns=['Nitrogen','Phosphorus','Potassium','Temperature','Humidity','pH','Rainfall','Crop']
# Preview the first 5 rows to check the new names.
data.head(5)

In [None]:
# Print the number of examples, the column names, and the per-column info of the dataset.
explore_data(data)

Let's now check how many examples of each crop type exist in our dataset.

Run the cell below and explore the crops we are studying

In [None]:
# Count how many rows belong to each crop type.
data["Crop"].value_counts()

### What do you observe?

## Double click on this cell to write your observations.

### Let's now calculate the average temperature needed for each crop type

In [None]:
# Arguments: the column to average, the column to group by, and the dataset.
calculate_average('Temperature', 'Crop', data)

### What is the crop requiring the highest temperature?

### Now it is your turn! 
### Calculate the average Humidity level required for each crop type.
Hint: Think of using the `calculate_average()` function provided to you!

In [None]:
# write your code here

# [Train an AI system](http://)

Now it is time to train an AI recommendation system that will learn how to recommend crops based on the different characteristics provided.

We use Machine Learning algorithms to build such a system!

### Start by running the cell below

In [None]:
# Column the system must learn to predict.
target ='Crop'
# Hold out 20% of the rows (fixed random seed) as a test set.
X_train, X_test, y_train, y_test = read_in_and_split_data(data, target)

# Report 10-fold cross-validated accuracy for the spot-check models (currently only an SVM).
models = GetModel()
names,results = fit_model(X_train, y_train,models)
# Build the model used for the predictions below: scale each feature with
# MinMaxScaler, then classify with Gaussian Naive Bayes.
# NOTE(review): this is a different algorithm from the SVM scored above, and
# X_test / y_test are not used in the cells shown here — confirm this is intended.
pipeline = make_pipeline(MinMaxScaler(), GaussianNB())
model = pipeline.fit(X_train, y_train)

# [Predict unseen data](http://)

Let's now use our system to decide which crop type we should grow on Mars!

As showcased below, all we need is to set the values for the soil and weather parameters. DO NOT CHANGE ANYTHING ELSE!

In [None]:
# Soil and weather readings for the plot of land to evaluate.
nitrate = 80
phosphorus = 42
potassium = 43
temperature = 500.82312
humidity = 82.00284
ph = 8.50232
rainfall = 1000.93536

# The order must match the training columns:
# Nitrogen, Phosphorus, Potassium, Temperature, Humidity, pH, Rainfall.
sample = [nitrate, phosphorus, potassium, temperature, humidity, ph, rainfall]
# predict() expects a 2-D input, so reshape the 7 values into a single row.
single_sample = np.array(sample).reshape(1,-1)
pred = model.predict(single_sample)
# Fixed the misspelling "stystem" in the message shown to the user.
print(pred.item().title(), "is recommended by this AI system for your farm on Mars!")

YOUR TURN!
# Indicate the values for each parameter and run the cell below to test our AI system


In [None]:
nitrate =
phosphorus = 
potassium = 
temperature = 
humidity = 
ph = 
rainfall = 

# DO NOT CHANGE ANYTHING BELOW
# The order must match the training columns:
# Nitrogen, Phosphorus, Potassium, Temperature, Humidity, pH, Rainfall.
sample = [nitrate, phosphorus, potassium, temperature, humidity, ph, rainfall]
# predict() expects a 2-D input, so reshape the 7 values into a single row.
single_sample = np.array(sample).reshape(1,-1)
pred = model.predict(single_sample)
# Fixed the misspelling "stystem" in the message shown to the user.
print(pred.item().title(), "is recommended by this AI system for your farm on Mars!")

# Conclusion

We hope this work provided you with an overview of what data science is and how useful and important it is in supporting the decision-making process.

We also hope that you enjoyed this experience and that it inspired you in choosing Information Systems and Data Science as a major to study at CMUQ.
