# Explore here

In [63]:
# Libraries
import pandas as pd
import numpy as np
import regex as re

import matplotlib.pyplot as plt
import seaborn as sns

from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC

import warnings

In [64]:
# Download the URL spam dataset and keep a local snapshot of it.
rawdata = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv')

# index=False: by default to_csv also writes the row index as an extra,
# unnamed first column, so the snapshot would not match the downloaded file.
rawdata.to_csv('../data/raw/url_spam_raw.csv', index=False)
rawdata.head()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


In [65]:
# Recode the spam label as an integer column (True -> 1, False -> 0).
spam_as_int = rawdata['is_spam'].astype(int)
rawdata['is_spam'] = spam_as_int

rawdata.head()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,1
1,https://www.hvper.com/,1
2,https://briefingday.com/m/v4n3i4f3,1
3,https://briefingday.com/n/20200618/m#commentform,0
4,https://briefingday.com/fan,1


In [66]:
# Print the column names, non-null counts and dtypes of the loaded frame.
rawdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2999 non-null   object
 1   is_spam  2999 non-null   int32 
dtypes: int32(1), object(1)
memory usage: 35.3+ KB


In [67]:
# Rows that exactly repeat an earlier row (first occurrences are not flagged).
# The duplicate mask is computed once; len(dupes) is the number of duplicates.
dupes = rawdata[rawdata.duplicated()]

print(dupes)

                                                    url  is_spam
60    https://briefingday.us8.list-manage.com/unsubs...        1
61                               https://www.hvper.com/        1
62                   https://briefingday.com/m/v4n3i4f3        1
64                          https://briefingday.com/fan        1
113                         https://briefingday.com/fan        1
...                                                 ...      ...
2971  https://www.cnbc.com/2020/06/29/stock-market-f...        0
2972                      https://thehustle.co/account/        1
2973                              https://thehustle.co/        1
2979               https://www.bloomberg.com/tosv2.html        1
2995                      https://www.youtube.com/watch        1

[630 rows x 2 columns]


In [68]:
# Count the URLs whose text contains 'briefingday'.
has_briefingday = rawdata['url'].str.contains('briefingday')
briefingday = rawdata[has_briefingday]
print(f"briefingday count = {len(briefingday)}")

# Same check for 'youtube'.
has_youtube = rawdata['url'].str.contains('youtube')
yt = rawdata[has_youtube]
print(f"youtube count = {len(yt)}")

briefingday count = 69
youtube count = 104


In [69]:
# Subset of the duplicated rows whose URL contains 'youtube'.
youtube_in_dupes = dupes['url'].str.contains('youtube')
yt_dupes = dupes[youtube_in_dupes]
print(f"yt dupes count = {len(yt_dupes)}")

yt dupes count = 14


##### No quitamos los duplicados, ya que muchos duplicados realmente son links distintos y pueden aportar a la clasificación del modelo

In [70]:
# Count URLs by scheme. Any URL that starts with neither prefix is printed
# so it can be inspected before the totals.
url_text = rawdata['url'].map(str)
is_https = url_text.str.startswith('https://')
is_http = url_text.str.startswith('http://')

for unmatched in rawdata['url'][~(is_https | is_http)]:
    print(unmatched)

https = int(is_https.sum())
http = int(is_http.sum())

print(f"https:// = {https}")
print(f"http:// = {http}")
print(f"total = {https + http}")
print(f"dataset length = {len(rawdata)}")

https:// = 2945
http:// = 54
total = 2999
dataset length = 2999


In [71]:
def clean_htps(columna):
    """Strip the leading URL scheme and a leading ``www.`` from every entry.

    Only a prefix is removed: an ``https://``, ``http://`` or ``www.`` that
    appears later in the URL (for example inside a redirect query string or
    in the path) is left untouched.

    Parameters
    ----------
    columna : iterable
        URLs to clean, e.g. a pandas Series or a list of strings.

    Returns
    -------
    list
        One entry per input, in the same order. Entries that are not strings
        (such as None or NaN) are returned unchanged.
    """
    schemes = ('https://', 'http://')
    host_prefix = 'www.'

    final_column = []
    for row in columna:
        if isinstance(row, str):
            # Drop the scheme by slicing rather than str.replace, which would
            # also delete every later occurrence inside the URL.
            for scheme in schemes:
                if row.startswith(scheme):
                    row = row[len(scheme):]
                    break
            # With the scheme gone, drop a single leading 'www.'.
            if row.startswith(host_prefix):
                row = row[len(host_prefix):]
        final_column.append(row)

    return final_column

# Store the cleaned URLs next to the originals and show the resulting frame.
stripped_urls = clean_htps(rawdata['url'])
rawdata['url_stripped'] = stripped_urls
print(rawdata)

                                                    url  is_spam  \
0     https://briefingday.us8.list-manage.com/unsubs...        1   
1                                https://www.hvper.com/        1   
2                    https://briefingday.com/m/v4n3i4f3        1   
3      https://briefingday.com/n/20200618/m#commentform        0   
4                           https://briefingday.com/fan        1   
...                                                 ...      ...   
2994  https://www.smartcitiesworld.net/news/news/dee...        0   
2995                      https://www.youtube.com/watch        1   
2996  https://techcrunch.com/2019/07/04/an-optimisti...        0   
2997  https://www.technologyreview.com/2019/12/20/13...        0   
2998       https://www.bbc.com/news/technology-51018758        0   

                                           url_stripped  
0           briefingday.us8.list-manage.com/unsubscribe  
1                                            hvper.com/  
2        