In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

In [49]:
data = pd.read_excel('Sexual_Harassment_Data_for_Machine_Learning_Project.xlsx')
data.head()

Unnamed: 0,Area,Zone,Time,People.Frequency,Is.Police_Station,Is.Bar,Tier,Residence.Level,Class
0,Ramapuram,Aanandam Nagar,Morning,Medium,Yes,No,Middle,Medium,Safe
1,Ramapuram,Aanandam Nagar,Afternoon,High,Yes,No,Middle,Medium,Safe
2,Ramapuram,Aanandam Nagar,Evening,Medium,Yes,No,Middle,Medium,Safe
3,Ramapuram,Aanandam Nagar,Night,Low,Yes,No,Middle,Medium,Safe
4,Ramapuram,Ambal Nagar,Morning,Medium,No,No,Outer,Medium,Safe


In [50]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Area               104 non-null    object
 1   Zone               104 non-null    object
 2   Time               104 non-null    object
 3   People.Frequency   104 non-null    object
 4   Is.Police_Station  104 non-null    object
 5   Is.Bar             104 non-null    object
 6   Tier               104 non-null    object
 7   Residence.Level    104 non-null    object
 8   Class              104 non-null    object
dtypes: object(9)
memory usage: 7.4+ KB


In [51]:
# Pipeline for numerical data

class NameDropper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.drop(['Zone','Area'], axis=1)
    
class Imputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        imputer = SimpleImputer(strategy='median')
        imputer.fit(X)
        return imputer.transform(X)
    
class FeatureToNum(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        en_list = ["Is.Police_Station","Is.Bar"]
        temp = []
        idx = 0
        for str in en_list:
            for val in X[str].tolist():
                temp.append(1 if val=="Yes" else 0)
            X[en_list[idx]] = pd.Series(temp)            
            temp = []
            idx += 1
        
        for val in X["Class"].tolist():
            temp.append(1 if val=="Safe" else 0)
        X["Class"] = pd.Series(temp)            
        temp = []

        return X
        

    
class FeatureEncode(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        encoder = OneHotEncoder()
        timeMatrix = encoder.fit_transform(X[['Time']]).toarray()
        pfMatrix = encoder.fit_transform(X[['People.Frequency']]).toarray()
        tierMatrix = encoder.fit_transform(X[['Tier']]).toarray()
        resiMatrix = encoder.fit_transform(X[['Residence.Level']]).toarray()

        timeCol = X['Time'].unique()
        pfCol = X['People.Frequency'].unique()
        tierCol = X['Tier'].unique()
        resiCol = X['Residence.Level'].unique()

        for i in range(len(timeCol)):
            X[timeCol[i]] = timeMatrix.T[i]
        
        for i in range(len(pfCol)):
            X[pfCol[i]] = pfMatrix.T[i]
        
        for i in range(len(tierCol)):
            X[tierCol[i]] = tierMatrix.T[i]
        
        for i in range(len(resiCol)):
            X[resiCol[i]] = resiMatrix.T[i]

        return X.drop(['Time', 'People.Frequency', 'Tier', 'Residence.Level'], axis=1)
    

In [52]:
# Pipeline for text data

from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

class TextFeatureEncode(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        countVec = CountVectorizer()
        cvCount =  countVec.fit_transform(X["Area"])
        tfidf = TfidfTransformer()
        tfidfCount = tfidf.fit_transform(cvCount)
        X["Area"] = tfidfCount.toarray().tolist()
        return X

In [61]:
from sklearn.pipeline import Pipeline

num_pipeline = Pipeline([
    ('name_dropper', NameDropper()),
    ('feature_encode', FeatureEncode()),
    ('feature_to_num', FeatureToNum()),
    # ('text_feature_encode', TextFeatureEncode()),
])

filteredData  = num_pipeline.fit_transform(data).head(30)



In [62]:
# Train test split

X = filteredData.drop(['Class'], axis=1)
y = filteredData['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
filteredData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Is.Police_Station  30 non-null     int64  
 1   Is.Bar             30 non-null     int64  
 2   Class              30 non-null     int64  
 3   Morning            30 non-null     float64
 4   Afternoon          30 non-null     float64
 5   Evening            30 non-null     float64
 6   Night              30 non-null     float64
 7   Medium             30 non-null     float64
 8   High               30 non-null     float64
 9   Low                30 non-null     float64
 10  Middle             30 non-null     float64
 11  Outer              30 non-null     float64
 12  Inner              30 non-null     float64
dtypes: float64(10), int64(3)
memory usage: 3.2 KB


In [63]:
# Train model

from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [65]:
# Test model

y_pred = log_reg.predict(X_test)
y_pred

array([0, 0, 1, 1, 1, 1], dtype=int64)

In [66]:
# Evaluate model

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))


[[2 0]
 [0 4]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         4

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

1.0
