In [None]:
import pandas as pd

## Collection of Data

In [None]:
# Load the two source CSV files into DataFrames. The bare file names are
# relative paths, so both files must sit in the notebook's working directory.
legitimate_urls = pd.read_csv("legitimate-urls.csv")
phishing_urls = pd.read_csv("phishing-urls.csv")

In [None]:
# Preview the first 10 rows of each dataset. A notebook cell only renders its
# last bare expression, so the legitimate-URL preview used to be silently
# discarded; passing both frames to display() renders both.
# (display is injected as a builtin by the IPython/Jupyter kernel.)
display(legitimate_urls.head(10), phishing_urls.head(10))

## Data PreProcessing
#### The data is in two dataframes, so we concatenate them into a single dataframe
Note: the two dataframes have the same column names

In [None]:
# Stack the phishing rows underneath the legitimate rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is its replacement and, like append, keeps each frame's original
# row index (the index is renumbered later, after shuffling).
urls = pd.concat([legitimate_urls, phishing_urls])


In [None]:
# Peek at the first five rows of the combined frame (head() defaults to n=5).
urls.head()

In [None]:
# List the column labels of the combined frame; the column-removal step that
# follows selects columns by position, so this shows which labels those are.
urls.columns

#### Removing Unnecessary columns

In [None]:
# Drop the columns found at positions 0, 3 and 5 of the raw column layout.
# The labels are resolved from legitimate_urls rather than from urls itself,
# and labels that are already gone are ignored, so re-running this cell is a
# no-op instead of silently dropping three *further* columns.
# NOTE(review): assumes the combined frame keeps legitimate_urls' column order
# (the notebook states both source frames share the same column names).
columns_to_drop = legitimate_urls.columns[[0, 3, 5]]
urls = urls.drop(columns=columns_to_drop, errors="ignore")

#### Since we concatenated the two dataframes, the top 1000 rows hold legitimate URLs and the bottom 1000 rows hold phishing URLs. If we split the data as it is and build a model on it, the model will overfit, so we need to shuffle the rows before splitting the data into a training set and a test set

In [None]:
# Shuffle every row (frac=1) so the two classes are interleaved rather than
# stacked one after the other, then renumber the index from 0.
# A fixed random_state makes the shuffle, and everything derived from it,
# reproducible across kernel restarts; 100 matches the seed used for the
# train/test split.
urls = urls.sample(frac=1, random_state=100).reset_index(drop=True)

#### Separating the class variable (`label`) from the feature columns

In [None]:
# Separate the target column from the feature columns.
labels = urls['label']
urls_without_labels = urls.drop(columns='label')

# Show the remaining feature columns. This has to be the cell's last
# expression to be rendered; placed mid-cell it was evaluated and discarded.
urls_without_labels.columns


#### splitting the data into train data and test data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# identical from run to run.
data_train, data_test, labels_train, labels_test = train_test_split(
    urls_without_labels,
    labels,
    test_size=0.20,
    random_state=100,
)

In [None]:
# Sanity check: row counts of the train/test features and labels, in that order.
print(*map(len, (data_train, data_test, labels_train, labels_test)))

In [None]:
# Class balance of the training labels: number of rows per distinct label.
# This replaces counting each class by hand, e.g.
# labels_train[labels_train == 0].count() and labels_train[labels_train == 1].count()
# (presumably the labels are 0/1 -- TODO confirm against the data).
labels_train.value_counts()

In [None]:
# Class balance of the held-out test labels: number of rows per distinct label.
labels_test.value_counts()

#### creating the model and fitting the data into the model

In [None]:
from sklearn.tree import DecisionTreeClassifier

# scikit-learn permutes the features at every split, so an unseeded tree can
# pick different (equally good) splits on each run. Pinning random_state makes
# the fitted tree, and therefore the reported metrics, reproducible.
model = DecisionTreeClassifier(random_state=100)
model.fit(data_train, labels_train)

#### predicting the result for test data

In [None]:
# Predict a label for every row of the held-out test features using the
# fitted model; the predictions are kept for the evaluation step.
pred_label = model.predict(data_test)

In [None]:
# Debug aid (disabled): uncomment the line below to print the predicted labels
# and the true test labels for a manual comparison.
#print(pred_label),print(list(labels_test))

#### creating confusion matrix and checking the accuracy

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of the test predictions: rows are the true classes,
# columns are the predicted classes.
cm = confusion_matrix(y_true=labels_test, y_pred=pred_label)
print(cm)

# Fraction of test rows classified correctly, shown as the cell's output.
accuracy_score(y_true=labels_test, y_pred=pred_label)