# User Input
Please enter the path to the input CSV file below:

In [None]:
####################################################
# Path of the raw CSV file to process. It is read with pd.read_csv below, and
# the output path is derived from it when saving: the trailing digits and
# ".csv" are stripped and 'Unprocessed_data' becomes 'Processed_data'.
filename =  "../Data/Unprocessed_data/Bitcoin_13395.csv"
####################################################

## Importing libraries & data

In [None]:
import pandas as pd
import numpy as np
import re 
from tqdm import tqdm
tqdm.pandas()
import string

# Load the raw CSV designated by `filename` into a DataFrame.
data = pd.read_csv(filename)
# Last expression of the cell: previews the first rows when run in a notebook.
data.head()

## Cleaning the data

In [None]:
# Formatting the columns: parse the dates, then keep only the columns of
# interest, in a fixed order.
kept_columns = ['username', 'date', 'hashtags', 'mentions', 'retweets', 'favorites', 'text']
data['date'] = pd.to_datetime(data['date'])
data = data[kept_columns]

In [None]:
# Cleaning text helper function
def text_cleaning(text):
    """Strip Twitter-specific noise from a tweet's text.

    Removes the ``@`` and ``#`` symbols (the mentioned name / hashtag word
    itself is kept), standalone ``RT`` retweet markers and ``pictwitter...``
    picture tokens, and turns line feeds and non-breaking spaces into
    regular spaces.

    Args:
        text: Raw tweet text. Non-string values (e.g. the NaN pandas uses
            for an empty CSV cell) are accepted.

    Returns:
        The cleaned text, or '' when the input is not a string, is the
        literal 'nan', or contains no ASCII letter or digit.
    """
    # The type check has to come first: re.search raises TypeError on
    # non-string input such as NaN.
    if not isinstance(text, str) or text == 'nan':
        return ''
    if not re.search('[a-zA-Z0-9]', text):
        return ''

    text = re.sub(r'@', '', text)                 # Remove the @ of mentions
    text = re.sub(r'#', '', text)                 # Remove hashtag symbols
    # \b keeps words that merely end in "RT" (e.g. "SMART") intact
    text = re.sub(r'\bRT\s+', '', text)           # Remove RT markers
    text = re.sub(r'\n', ' ', text)               # Replace line terminators
    text = re.sub(r'(pictwitter)\w+', '', text)   # Remove picture names
    text = re.sub(r'\xa0', ' ', text)             # Replace non-breaking spaces

    return text

# Presence of URL helper function
def url_detect(text):
    """Count the http(s) URLs that appear in *text*.

    Args:
        text: Tweet text. Non-string values (e.g. NaN for an empty CSV
            cell) are treated as containing no URL.

    Returns:
        The number of ``http://`` / ``https://`` URLs found, 0 if none.
    """
    if not isinstance(text, str):
        return 0
    # \S* ends each match at the next whitespace, so several URLs on the
    # same line are counted separately instead of being merged into one.
    return len(re.findall(r'https?://\S*', text))

In [None]:
# Store url_detect's result for every tweet in a new 'url' column
data['url'] = data['text'].progress_apply(url_detect)

In [None]:
####################################################
# Engagement thresholds used by the filtering step: a tweet containing a URL
# is treated as an ad when both its favorites and its retweets are at or
# below these values.
likes_threshold = 5
retweets_thershold = 5
####################################################

# Looking for scam keywords (one keyword per line in the file)
with open('Scam_keywords.txt', 'r') as file:
    content = file.readlines()
# Normalise each keyword (trim surrounding whitespace, lowercase) and skip
# blank lines: an empty keyword would make the scam pattern match almost
# any text.
scam_dictionary = [
    keyword
    for keyword in (line.strip().lower() for line in content)
    if keyword
]

def scam_detect(text):
    """Tell whether a tweet looks like a scam or an advertisement.

    A tweet is flagged when it contains one of the keywords of the
    module-level ``scam_dictionary`` delimited on both sides by a space or a
    punctuation character, or when it has no alphabetic word surrounded by
    spaces.

    Args:
        text: Raw tweet text. Non-string values (e.g. NaN for an empty CSV
            cell) are flagged, since they contain no words either.

    Returns:
        True if the tweet is flagged, False otherwise.
    """
    if not isinstance(text, str):
        return True

    lowered = text.lower()
    # Escaping keeps every punctuation character (and, below, every keyword)
    # literal instead of letting it act as regex syntax.
    delimiter = f'[{re.escape(string.punctuation)} ]'
    for word in scam_dictionary:
        # An empty keyword would match any two adjacent delimiters
        if word and re.search(delimiter + re.escape(word) + delimiter, lowered):
            return True

    # No alphabetic word surrounded by spaces: flag the tweet as well
    return re.search(' [a-zA-Z]+ ', text) is None

# Filtering data
# A tweet is considered an ad when it carries a URL while staying at or below
# both engagement thresholds, or when the scam detector flags it.
mask1 = data['url'] > 0
mask2 = data['favorites'] <= likes_threshold
mask3 = data['retweets'] <= retweets_thershold
mask4 = data['text'].progress_apply(scam_detect)
is_ad = (mask1 & mask2 & mask3) | mask4

ads_data = data[is_ad].drop(columns='url')
no_ads_data = data.drop(index=ads_data.index).drop(columns='url')

remaining_pct = round(100 * len(no_ads_data) / len(data))
print(f"Remaining data: {remaining_pct} %")

In [None]:
# Cleaning Text
data.text = data.text.progress_apply(text_cleaning)
# ads_data / no_ads_data were split off from `data` before this point, so
# they still hold the raw text. Copy the cleaned text over (rows are matched
# on the index) so that the file saved below really contains cleaned tweets.
no_ads_data = no_ads_data.assign(text=data.text)
ads_data = ads_data.assign(text=data.text)

In [None]:
# # Get a look at a sample from detected ads
# pd.set_option('display.max_colwidth', None)
# for txt in ads_data.text.sample(5):
#     print(txt)
#     print()

## Saving Data

In [None]:
# ads_filename = re.sub(r'\d+\.csv', '', filename) + '_ads_' + str(len(ads_data)) + ".csv"
# ads_data.to_csv(ads_filename, index=False)

# Build the output path from the input one: strip the ".csv" extension and
# any digits right before it (the digits are optional, so "name.csv" works as
# well as "name_123.csv"), then append the number of kept rows and '_clean'.
# The pattern is anchored at the end so only the extension is touched.
no_ads_filename = re.sub(r'\d*\.csv$', '', filename) + str(len(no_ads_data)) + '_clean' + ".csv"
# Plain substring swap of the directory name; no regex needed
no_ads_filename = no_ads_filename.replace('Unprocessed_data', 'Processed_data')
no_ads_data.to_csv(no_ads_filename, index=False)
