# Clean unwanted words from news

In [4]:
import pandas as pd
import numpy as np

#for sql handling
import psycopg2
import sql
from sql import engine
from sql import get_data

# check text matching
import Levenshtein                                              # install: with pip install Levenshtein
import string
from sklearn.metrics.pairwise import cosine_similarity          # install: conda install sklearn
from sklearn.feature_extraction.text import CountVectorizer
import nltk                                                     # install: conda install -c anaconda nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [5]:
# cleaning text function 

def clean_titles(df, stop_words=None):
    """Add a ``clean_title`` column holding a normalised copy of ``title_en``.

    Each title is stripped of ASCII punctuation, lower-cased and then has its
    stopwords removed, in that order, so stopword matching is done on the
    normalised text rather than on the raw title.

    Args:
        df: DataFrame with a ``title_en`` column. It is modified in place.
        stop_words: Optional iterable of lower-case words to remove. Defaults
            to the module-level ``stopwords`` list.

    Returns:
        The same ``df`` object, with ``clean_title`` inserted at column
        position 1. Titles that are not strings (e.g. missing values) yield
        ``pd.NA``.

    Raises:
        ValueError: If ``df`` already contains a ``clean_title`` column
            (raised by ``DataFrame.insert``).
    """
    if stop_words is None:
        stop_words = stopwords
    stop_words = frozenset(stop_words)  # O(1) membership tests
    strip_punctuation = str.maketrans('', '', string.punctuation)

    cleaned = []
    # Iterate over the values (not over range(len(df))) so that a
    # non-default index cannot cause wrong or missing label lookups.
    for title in df['title_en']:
        if not isinstance(title, str):
            cleaned.append(pd.NA)
            continue
        text = title.translate(strip_punctuation).lower()
        cleaned.append(
            ' '.join(word for word in text.split() if word not in stop_words)
        )

    # Re-use the frame's own index so the new column aligns row by row.
    clean_title = pd.Series(cleaned, index=df.index, dtype=pd.StringDtype())
    df.insert(1, 'clean_title', clean_title)
    return df

In [6]:
# Load the raw news rows through the project's `get_data` helper
# (presumably returns a pandas DataFrame; it is used as one below).
# NOTE(review): `table` looks like a placeholder and is a reserved word in
# SQL — confirm the real source table name before running.
df = get_data('SELECT * FROM table')



In [9]:
# Function to drop rows whose title or body contains any UNWANTED WORD

# Lower-case terms that mark an article as off-topic. Order matters only for
# which term is reported when several match (the first one in the list wins).
unwanted_words = [
    'autos', 'luxury', 'ferrari', 'lamborghini', 'tennis', 'futbol',
    'football', 'motor', 'suv', 'pickup', 'christies', 'rollex', 'rolex',
    'bentley', 'snob', 'skoda', 'mitsubishi', 'motor vehicle',
    'motor vehicles', 'hollywood', '4x4', 'electric rolls royce',
    'formula one', 'phantom', 'fantom', 'aston martin', 'porsche', 'bmw',
    'mercedes benz', 'expensive', 'toyota', 'kardashians', 'kardashian',
    'league', 'cars', 'britney spears',
]


def clean_unwanted_words(df, words=None):
    """Drop every row whose title or body contains an unwanted word.

    Matching is a case-insensitive *substring* search, so e.g. ``'cars'``
    also matches ``'Oscars'``. Each dropped row is reported with ``print``.

    Args:
        df: DataFrame with ``title_en`` and ``body_en`` columns. Modified in
            place.
        words: Optional iterable of lower-case terms to look for. Defaults to
            the module-level ``unwanted_words`` list.

    Returns:
        The same ``df`` object with the matching rows removed.
    """
    if words is None:
        words = unwanted_words

    labels_to_drop = []
    for idx, title, body in zip(df.index, df['title_en'], df['body_en']):
        # Missing values (None/NaN) are treated as empty text instead of
        # crashing on ``.lower()``.
        lower_title = title.lower() if isinstance(title, str) else ''
        lower_body = body.lower() if isinstance(body, str) else ''

        for word in words:
            if word in lower_title:
                print(title, '-----> unwanted word found in TITLE: ', word)
            elif word in lower_body:
                print(title, '-----> unwanted word found in BODY: ', word)
            else:
                continue
            labels_to_drop.append(idx)
            break

    # Drop once, after the scan: the frame is never mutated while it is being
    # iterated and the work stays linear in the number of rows.
    # NOTE: dropping is by index label, so with duplicated labels every row
    # sharing a matched label is removed.
    if labels_to_drop:
        df.drop(index=labels_to_drop, inplace=True)
    return df

In [10]:
# Remove every row whose title or body contains an unwanted word; each
# removed row is printed by the function.
df = clean_unwanted_words(df)

In [11]:
# Print a summary of the remaining frame to check the result of the filtering.
df.info()

In [12]:
#create table, define schema and upload to SQL

# Destination table and schema for the cleaned news rows.
table_name = 'news_clean'
schema = 'capstone'

if engine is not None:
    try:
        df.to_sql(
            name=table_name,      # Name of the SQL table
            con=engine,           # Engine or connection
            if_exists='replace',  # Replace an existing table; use 'append' to add rows instead
            schema=schema,        # Target schema defined above
            index=False,          # Do NOT write the DataFrame index as a column
            chunksize=5000,       # Number of rows written per batch
            method='multi',       # Pass multiple values in a single INSERT clause
        )
        print(f"The {table_name} table was imported successfully.")
    # Error handling: psycopg2.DatabaseError is itself an Exception subclass,
    # so a single clause covers driver errors as well as everything else.
    except Exception as error:
        print(error)
        # Presumably reset so that later cells guarded by the same
        # `engine is not None` check are skipped after a failed upload.
        engine = None
else:
    # Make the skipped upload visible instead of silently doing nothing.
    print(f"No database engine available; the {table_name} table was not uploaded.")