## Loading Data from CSV file

In [4]:
import pandas as pd
data = pd.read_csv('sales_returns.csv')
data.shape

(1302, 25)

## Selecting Input Variables ( Predictors)

In [6]:
data.tail(2)

Unnamed: 0.1,Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,cust_age,cust_gender,...,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit,returned
1300,9991,9992,CA-2018-121258,26/02/18,03/03/18,Standard Class,DB-13060,Dave Brooks,23,Female,...,West,TEC-PH-10003645,Technology,Phones,Aastra 57i VoIP phone,258.576,2,0.2,19.3932,True
1301,9992,9993,CA-2018-121258,26/02/18,03/03/18,Standard Class,DB-13060,Dave Brooks,21,Female,...,West,OFF-PA-10004041,Office Supplies,Paper,"It's Hot Message Books with Stickers, 2 3/4"" x 5""",29.6,4,0.0,13.32,True


In [7]:
X = data.loc[:,['cust_age','cust_gender','city','sub_category']]
y = data.returned
X.head()

Unnamed: 0,cust_age,cust_gender,city,sub_category
0,19,Female,San Francisco,Art
1,19,Female,San Francisco,Phones
2,19,Female,San Francisco,Binders
3,20,Female,Troy,Storage
4,20,Female,Troy,Paper


In [8]:
data.info

<bound method DataFrame.info of       Unnamed: 0  row_id        order_id order_date ship_date       ship_mode  \
0             18      19  CA-2015-143336   27/08/15  01/09/15    Second Class   
1             19      20  CA-2015-143336   27/08/15  01/09/15    Second Class   
2             20      21  CA-2015-143336   27/08/15  01/09/15    Second Class   
3             55      56  CA-2017-111682   17/06/17  18/06/17     First Class   
4             56      57  CA-2017-111682   17/06/17  18/06/17     First Class   
...          ...     ...             ...        ...       ...             ...   
1297        9957    9958  US-2015-143287   11/11/15  17/11/15  Standard Class   
1298        9958    9959  US-2015-143287   11/11/15  17/11/15  Standard Class   
1299        9990    9991  CA-2018-121258   26/02/18  03/03/18  Standard Class   
1300        9991    9992  CA-2018-121258   26/02/18  03/03/18  Standard Class   
1301        9992    9993  CA-2018-121258   26/02/18  03/03/18  Standard Class

## Data Preparation

In [9]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
X.cust_gender = enc.fit_transform(X.cust_gender)
X.city = enc.fit_transform(X.city)
X.sub_category = enc.fit_transform(X.sub_category)
X.head()

Unnamed: 0,cust_age,cust_gender,city,sub_category
0,19,0,156,2
1,19,0,156,13
2,19,0,156,3
3,20,0,173,14
4,20,0,173,12


## Spliting Data to Train and Test 

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [11]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 326 entries, 769 to 471
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   cust_age      326 non-null    int64
 1   cust_gender   326 non-null    int32
 2   city          326 non-null    int32
 3   sub_category  326 non-null    int32
dtypes: int32(3), int64(1)
memory usage: 8.9 KB


## Machine Learning Modeling - Training

In [12]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train,y_train)  #training

## Machine Learning Model Evaluation - Testing

In [None]:
from sklearn.metrics import accuracy_score
y_predict = model.predict(X_test)
print(accuracy_score(y_test,y_predict))

## Model Evaluation - Confusion Matrix

In [None]:
pd.crosstab(y_test,y_predict)

In [None]:
model.predict([[40,0,100,15]]) # cust_age, gender, city, sub_category - input features used to train the model

In [None]:
model.predict([[20,1,50,10]])