## Processing text to find similarities

In [1]:
import pandas as pd
import numpy as np

#for sql handling
import psycopg2
import sql
from sql import engine
from sql import get_data

# check text matching
import Levenshtein                                              # install: with pip install Levenshtein
import string
from sklearn.metrics.pairwise import cosine_similarity          # install: conda install sklearn
from sklearn.feature_extraction.text import CountVectorizer
import nltk                                                     # install: conda install -c anaconda nltk
from nltk.corpus import stopwords
# Rebind the name `stopwords` from the NLTK corpus reader imported on the previous line
# to the collection of English stop words; clean_titles filters title words against it.
stopwords = stopwords.words('english')

Python-dotenv could not parse statement starting at line 1
Python-dotenv could not parse statement starting at line 2


.env file found and working


## Load all news titles from the `news_clean` table and define a function to clean each one

In [11]:
# Fetch every row and column of the news_clean table through the project's get_data helper.
# NOTE(review): presumably returns a pandas DataFrame; later cells read its 'title_en' column.
news = get_data('SELECT * FROM  news_clean')



In [12]:
# cleaning text function 

def clean_titles(df):
    """Add a normalised `clean_title` column derived from `title_en`.

    Each title is stripped of punctuation, lower-cased and has its stop words
    removed, in that order, so the stop-word filter sees the already
    normalised words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `title_en` column of strings. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with `clean_title` (string dtype) inserted as
        the second column. If the column already exists (for example when the
        cell is re-run) it is overwritten instead of raising.

    Raises
    ------
    TypeError
        If a title is missing (None/NaN) and therefore cannot be iterated.
    """
    punctuation = set(string.punctuation)
    stop_words = set(stopwords)  # set membership is O(1); the module-level value is a list

    cleaned = []
    # Iterate over the values rather than looking rows up by label, so the
    # function also works when the index is not 0..n-1.
    for title in df['title_en']:
        text = ''.join(char for char in title if char not in punctuation)
        text = text.lower()
        # Filter the normalised text (not the raw title), otherwise the two
        # steps above would be discarded.
        text = ' '.join(word for word in text.split() if word not in stop_words)
        cleaned.append(text)

    # Share df's index so the assignment below aligns row for row.
    clean_title = pd.Series(cleaned, index=df.index, dtype=pd.StringDtype())

    if 'clean_title' in df.columns:
        df['clean_title'] = clean_title
    else:
        df.insert(1, 'clean_title', clean_title)
    return df

**Dataframe with clean titles**

In [13]:
# Add the `clean_title` column to the news frame (clean_titles places it as the second column).
news = clean_titles(news)

**Create vectors from 2 titles**

In [14]:
# functions to create vectors from two titles

def vectorize_function(title1, title2):
    """Encode two titles as word-count vectors over their joint vocabulary.

    A fresh CountVectorizer is fitted on just these two titles, and the
    resulting count matrix is returned as a dense array with one row per
    title (row 0 for `title1`, row 1 for `title2`).
    """
    count_matrix = CountVectorizer().fit_transform([title1, title2])
    return count_matrix.toarray()

In [15]:
# function to calculate the cosine similarity matrix of the two titles

def cosine_matrix_function(vectors):
    """Return the pairwise cosine-similarity matrix for the rows of `vectors`."""
    return cosine_similarity(vectors)
    

In [16]:
# function to compute the cosine similarity of two 1-D vectors (each is reshaped to a single row first)

def cosine_similarity_vectors_function(vec1, vec2):
    """Return the cosine similarity between two vectors as a scalar.

    Parameters
    ----------
    vec1, vec2 : numpy.ndarray
        Vectors with the same number of elements.

    Returns
    -------
    The single entry of the 1x1 similarity matrix computed by
    `cosine_similarity`.
    """
    # cosine_similarity works on 2-D inputs, so turn each vector into one row.
    # -1 is the documented placeholder for "infer this dimension"; the
    # previous -2 on vec2 was a typo.
    vec1 = vec1.reshape(1, -1)
    vec2 = vec2.reshape(1, -1)
    return cosine_similarity(vec1, vec2)[0][0]

In [17]:
# function to know similarity of titles

def check_similarity(title1, title2, threshold=0.7):
    """Classify two titles as similar, different, or undecided.

    The titles are turned into word-count vectors and compared by cosine
    similarity.

    Parameters
    ----------
    title1, title2 : str
        Titles to compare.
    threshold : float, default 0.7
        Similarity strictly above this value counts as a match.

    Returns
    -------
    bool or str
        False when the similarity is exactly 0.0,
        True when it is greater than `threshold`,
        and the string 'inconclusive' for anything in between.
    """
    # create vectors from the two titles
    vectors = vectorize_function(title1, title2)

    # calculate the cosine similarity of the two vectors
    similarity = cosine_similarity_vectors_function(vectors[0], vectors[1])

    if similarity == 0.0:
        return False
    if similarity > threshold:
        return True
    return 'inconclusive'

## Combine the helper functions to filter out near-duplicate news

In [18]:
#function to filter dataframe for similar news

def filter_news(df):
    """Return a copy of `df` without near-duplicate headlines.

    Rows are scanned in order. The first occurrence of a headline is kept and
    every later row whose `clean_title` is judged similar to it by
    `check_similarity` (result is True) is removed. Rows already marked as
    duplicates are never used as a reference again, so one representative of
    each group of similar headlines always survives.

    Each matching pair is printed so the removals can be inspected.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `clean_title` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows, keeping their original index
        labels and order.
    """
    titles = df['clean_title'].tolist()
    is_duplicate = [False] * len(titles)

    for pos1, title1 in enumerate(titles):
        # A row that was already removed must not remove others; otherwise the
        # kept representative would be dropped in turn.
        if is_duplicate[pos1]:
            continue

        # Only look forward: earlier rows were already compared with this one.
        for pos2 in range(pos1 + 1, len(titles)):
            if is_duplicate[pos2]:
                continue

            title2 = titles[pos2]

            # check_similarity can also return 'inconclusive', so test for True explicitly.
            if check_similarity(title1, title2) is True:
                print('')
                print('-----------')
                print(title1)
                print(title2)
                print('-----------')
                is_duplicate[pos2] = True

    # Select by position so duplicate index labels cannot remove extra rows.
    kept_positions = [pos for pos, dup in enumerate(is_duplicate) if not dup]
    return df.iloc[kept_positions].copy()

In [19]:
# Remove near-duplicate headlines. filter_news compares rows pairwise, so the
# run time grows quadratically with the number of news items.
df = filter_news(news)

In [20]:
# Summarise the filtered frame (row count, columns, dtypes, non-null counts).
df.info()

In [21]:
#create table, define schema and upload to SQL

table_name = 'news_unique'
schema = 'capstone'

# Upload only when the SQL engine imported from the project module is available.
if engine is not None:
    try:
        df.to_sql(
            name=table_name,      # name of the target SQL table
            con=engine,           # engine or connection to write through
            if_exists='replace',  # drop and recreate the table; use 'append' to add rows instead
            schema=schema,        # database schema that holds the table
            index=False,          # do NOT write the DataFrame index as a column
            chunksize=5000,       # number of rows written per batch
            method='multi',       # pass multiple rows in a single INSERT clause
        )
        print(f"The {table_name} table was imported successfully.")
    # Best-effort upload: report the problem and disable the engine for later cells.
    # psycopg2.DatabaseError derives from Exception, so one clause covers both.
    except Exception as error:
        print(error)
        engine = None