<a href="https://colab.research.google.com/github/jenbam/python_nltk_data_preprocessing/blob/main/Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder # converts our text in numerical form
from sklearn.model_selection import train_test_split

In [2]:
# getting data set
df = pd.read_csv("/content/sample_data/california_housing_train.csv")

In [3]:
df.shape

(17000, 9)

In [4]:
df.head

<bound method NDFrame.head of        longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -114.31     34.19                15.0       5612.0          1283.0   
1        -114.47     34.40                19.0       7650.0          1901.0   
2        -114.56     33.69                17.0        720.0           174.0   
3        -114.57     33.64                14.0       1501.0           337.0   
4        -114.57     33.57                20.0       1454.0           326.0   
...          ...       ...                 ...          ...             ...   
16995    -124.26     40.58                52.0       2217.0           394.0   
16996    -124.27     40.69                36.0       2349.0           528.0   
16997    -124.30     41.84                17.0       2677.0           531.0   
16998    -124.30     41.80                19.0       2672.0           552.0   
16999    -124.35     40.54                52.0       1820.0           300.0   

       population  ho

In [5]:
le = LabelEncoder()

In [6]:
df_encoded = df.apply(le.fit_transform, axis=0)

In [7]:
df_encoded.head

<bound method NDFrame.head of        longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0            826       165                  14         4529            1258   
1            825       186                  18         5045            1605   
2            824       115                  16          526             173   
3            823       110                  13         1285             336   
4            823       103                  19         1238             325   
...          ...       ...                 ...          ...             ...   
16995          3       759                  51         2000             393   
16996          2       770                  35         2132             527   
16997          1       836                  16         2458             530   
16998          1       833                  18         2453             551   
16999          0       755                  51         1603             299   

       population  ho

In [8]:
df = df_encoded.values

In [9]:
X = df[: , 1: ]
X

array([[ 165,   14, 4529, ...,  471,  539,  246],
       [ 186,   18, 5045, ...,  462, 1106,  376],
       [ 115,   16,  526, ...,  116,  781,  432],
       ...,
       [ 836,   16, 2458, ...,  455, 3901,  611],
       [ 833,   18, 2453, ...,  477, 1404,  433],
       [ 755,   51, 1603, ...,  269, 3850,  521]])

In [10]:
y = df[: , 0]
y

array([826, 825, 824, ...,   1,   1,   0])

In [11]:
X_train, y_train, X_test, y_test = train_test_split( X, y ,  test_size=0.20, random_state=42)

# Posterior Probability
# Likelihood
# Prior Probability

In [12]:
# Prior Probability

def prior_probab(y_train, label):    # label refers to x and zero values
  m = y_train.shape[0] # first value
  s = np.sum(y_train == label)  # y value has only class values

  return m/s

In [13]:
# Likelihood (for this we need conditional probability)
def cond_probab(X_train,y_train, feature_col, feature_val, label):
  X_filtered = X_train[y_train == label] # email values which are either spam or not spam
  num = np.sum(X_filtered[:,feature_col] == feature_val)
  denom = X_filtered.shape[0]
  return float(num/denom)
  


In [14]:
def predict(X_train, y_train, X_test):
  classes = np.unique(y_train)
  n_features = X_train.shape[1]
  posterior_probab = [] # probability of word being spam or not

  for label in classes:   # classes ={0,1}
    likelihood = 1.0  # initialise it randomly
    for feature in range(n_features):
      cond = cond_probab(X_train,y_train, feature, X_test[feature], label)
      likelihood = likelihood * cond

  prior = prior_probab(y_train, label)
  posterior = likelihood * prior  

  posterior_probab.append(posterior)  

  pred = np.argmax(posterior_probab)

  return pred 
  
     


In [15]:
#accuracy 
def accuracy(X_train, y_train, X_test, y_test):
  pred = []

  for i in range(X_test.shape[0]):
    p = predict(X_train, y_train, X_test[i])
    pred.append(p)

    y_pred = np.array(pred)
    acc = np.sum(y_pred == y_test)/y_pred.shape[0]

    return acc 


In [16]:
accuracy(X_train, y_train, X_test, y_test)

IndexError: ignored