In [1]:
[i for i in range(10)]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('url_spam_classification.csv')
df.head(5)

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


In [3]:
df['is_spam'] = df['is_spam'].apply(lambda x: 1 if x == True else 0)
df.head()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,1
1,https://www.hvper.com/,1
2,https://briefingday.com/m/v4n3i4f3,1
3,https://briefingday.com/n/20200618/m#commentform,0
4,https://briefingday.com/fan,1


In [31]:
df['len_url'] = df['url'].apply(lambda x : len(x))
df['contains_subscribe'] = df['url'].apply(lambda x : 1 if "subscribe" in x else 0)
df['contains_hash'] = df['url'].apply(lambda x : 1 if "#" in x else 0)
df['num_digits'] = df['url'].apply(lambda x : len("".join(char for char in x if char.isdigit())) )
df['non_https'] = df['url'].apply(lambda x : 1 if "https" in x else 0)
df['num_words'] = df['url'].apply(lambda x : len(x.split("/")))
df['contains_?'] = df['url'].apply(lambda x : 1 if "?" in x else 0)
df['contains_www'] = df['url'].apply(lambda x : 1 if "www" in x else 0)
df.head(5)

Unnamed: 0,url,is_spam,len_url,contains_subscribe,contains_hash,num_digits,non_https,num_words,contains_?,contains_www
0,https://briefingday.us8.list-manage.com/unsubs...,1,51,1,0,1,1,4,0,0
1,https://www.hvper.com/,1,22,0,0,0,1,4,0,1
2,https://briefingday.com/m/v4n3i4f3,1,34,0,0,4,1,5,0,0
3,https://briefingday.com/n/20200618/m#commentform,0,48,0,1,8,1,6,0,0
4,https://briefingday.com/fan,1,27,0,0,0,1,4,0,0


In [5]:
X = df.drop(['url', 'is_spam'], axis = 1).values
y = df['is_spam'].values

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [8]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)

In [9]:
training_data_accuracy

0.7845198159167917

In [10]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.7855095917197666


## Hyperparameter Tuning

In [11]:
penalty = ['l1', 'l2']
C = np.logspace(-4,4,20)

hyperparameters = dict(penalty=penalty, C=C)

logreg = LogisticRegression()

#cv itu cross validation
clf = GridSearchCV(logreg, hyperparameters, cv=10)

best_model = clf.fit(X_train, y_train)

print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

X_train_prediction = best_model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

# accuracy score on the test data
X_test_prediction = best_model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)
print('Accuracy score of the test data : ', test_data_accuracy)

Best Penalty: l2
Best C: 78.47599703514607
Accuracy score of the training data :  0.7845282446351208
Accuracy score of the test data :  0.7854421631098075


### use scaler

In [12]:
sc = MinMaxScaler()
pipe_clf = Pipeline([('scaler', sc), ('clf', logreg)])
pipe_clf.fit(X_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler()), ('clf', LogisticRegression())])

In [13]:
X_train_prediction = pipe_clf.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

# accuracy score on the test data
X_test_prediction = pipe_clf.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the training data :  0.7842332394936026
Accuracy score of the test data :  0.7853073058898891


## Use ExtraTreesClassifier

## Random Forest Classifier

In [29]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

X_train_prediction = clf.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

X_test_prediction = clf.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the training data :  0.8932081387704186
Accuracy score of the test data :  0.8908667947810256


In [30]:
# import joblib
# joblib.dump(clf, 'final_model.pkl')

['final_model.pkl']