In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Make sure the NLTK stop-word corpus is available, then load the English
# list into a set so membership checks are cheap.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

In [None]:
# Load the news articles into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
df = pd.read_csv(
    filepath_or_buffer='C:/Users/ishaan phaye/Desktop/Data files/news.csv'
)

In [None]:
# Replace missing articles with empty text *before* casting to str: casting
# first would turn NaN into the literal string 'nan', which would then be
# measured and summarised as if it were real content.
df['content'] = df['content'].fillna('').astype(str)
# Keep the text as loaded so it can be reported next to the processed version.
df['Original Content'] = df['content']

In [None]:
# Normalise the text: turn embedded newlines into spaces and lower-case it.
# The `.str` accessor is required here: Series.replace('\n', ' ') only swaps
# cells whose *entire* value equals '\n' and leaves newlines inside longer
# strings untouched.
df['content'] = (
    df['content']
    .str.replace('\n', ' ', regex=False)
    .str.lower()
)

In [None]:
# Descriptive metrics of the normalised article text.
# Sentences are approximated by splitting on '. '. Text without any words
# counts as zero sentences and reports both averages as 0.0, so np.mean is
# never called on an empty list (which yields NaN and a RuntimeWarning).
df['word_count'] = df['content'].apply(lambda text: len(text.split()))
df['sentence_count'] = df['content'].apply(
    lambda text: len(text.split('. ')) if text.strip() else 0
)
# Divide only where there is at least one sentence; the rest become 0.0.
df['avg_sentence_length'] = (
    df['word_count'] / df['sentence_count'].where(df['sentence_count'] > 0)
).fillna(0.0)
df['avg_word_length'] = df['content'].apply(
    lambda text: np.mean([len(word) for word in text.split()]) if text.split() else 0.0
)
# One human-readable summary string per article.
df['Original Metrics'] = df.apply(lambda x: f"Word Count: {x['word_count']}, Sentence Count: {x['sentence_count']}, Avg Sentence Length: {x['avg_sentence_length']:.2f}, Avg Word Length: {x['avg_word_length']:.2f}", axis=1)

In [None]:
# Hold out 10% of the articles as the test set; the sampled 90% is the
# training set. The fixed random_state makes the split reproducible.
train_set = df.sample(
    frac=0.9,
    random_state=42,
)
test_set = df.drop(index=train_set.index)

In [None]:
# Guarantee the vectorizer only ever sees strings: any missing training
# article is replaced by empty text.
train_set['content'] = train_set['content'].fillna(value='')

In [None]:
# Learn a TF-IDF vocabulary (English stop words removed) from the training
# articles and vectorise them in the same pass.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(raw_documents=train_set['content'])
# Pairwise cosine similarity between the rows of tfidf_matrix, i.e. between
# training articles.
similarity_matrix = cosine_similarity(X=tfidf_matrix)

In [None]:
# Extractive summarisation of every held-out article.
#
# Each article is split into sentences on '. '. A sentence is scored by the sum
# of its TF-IDF cosine similarities to the *other sentences of the same
# article*, using the vectorizer fitted on the training set. The
# highest-scoring fraction is kept in original order; the rest is recorded as
# removed.
#
# The scores must come from the article's own sentences: similarity_matrix
# holds similarities between training documents, so indexing it by sentence
# position yields numbers unrelated to the sentence and raises IndexError for
# articles with more sentences than there are training documents.
SUMMARY_RATIO = 0.3  # fraction of an article's sentences that is kept

cleaned_responses = []
removed_responses = []
for content in test_set['content']:
    sentences = str(content).split('. ')

    # Sentence-by-sentence similarity within this article.
    sentence_vectors = vectorizer.transform(sentences)
    sentence_similarity = cosine_similarity(sentence_vectors)
    # Row sums minus the diagonal: a sentence's similarity to itself is excluded.
    sentence_scores = sentence_similarity.sum(axis=1) - sentence_similarity.diagonal()

    # Always keep at least one sentence so short articles (fewer than four
    # sentences) do not end up with an empty summary.
    keep_count = max(1, int(len(sentences) * SUMMARY_RATIO))
    # sorted() is stable, so ties are resolved in favour of earlier sentences.
    ranked_positions = sorted(
        range(len(sentences)),
        key=lambda position: sentence_scores[position],
        reverse=True,
    )
    kept_positions = set(ranked_positions[:keep_count])

    # Rebuild both texts in the article's original sentence order.
    cleaned_sentences = [s for pos, s in enumerate(sentences) if pos in kept_positions]
    removed_lines = [s for pos, s in enumerate(sentences) if pos not in kept_positions]
    cleaned_responses.append('. '.join(cleaned_sentences))
    removed_responses.append('. '.join(removed_lines))

# Assign the whole column at once; the list is in the same row order as test_set.
test_set['Removed Lines'] = removed_responses

In [None]:
# Attach the summaries to the test set; cleaned_responses holds one entry per
# test row, in row order.
test_set = test_set.assign(**{'New Content': cleaned_responses})

In [None]:
# Additional metrics for reduced data.
# These overwrite the metric columns with values for the summarised text.
# A summary without any words counts as zero sentences and reports both
# averages as 0.0, so np.mean is never called on an empty list (which yields
# NaN and a RuntimeWarning).
test_set['word_count'] = test_set['New Content'].apply(lambda text: len(text.split()))
test_set['sentence_count'] = test_set['New Content'].apply(
    lambda text: len(text.split('. ')) if text.strip() else 0
)
# Divide only where there is at least one sentence; the rest become 0.0.
test_set['avg_sentence_length'] = (
    test_set['word_count'] / test_set['sentence_count'].where(test_set['sentence_count'] > 0)
).fillna(0.0)
test_set['avg_word_length'] = test_set['New Content'].apply(
    lambda text: np.mean([len(word) for word in text.split()]) if text.split() else 0.0
)
# One human-readable summary string per summarised article.
test_set['New Metrics'] = test_set.apply(lambda x: f"Word Count: {x['word_count']}, Sentence Count: {x['sentence_count']}, Avg Sentence Length: {x['avg_sentence_length']:.2f}, Avg Word Length: {x['avg_word_length']:.2f}", axis=1)

In [None]:
# Reduce the test set to the report columns, in presentation order.
test_set = test_set.loc[
    :,
    [
        'Original Content',
        'Original Metrics',
        'New Content',
        'New Metrics',
        'Removed Lines',
    ],
]

In [None]:
# Preview the first five rows of the report.
test_set.head(n=5)

In [None]:
# # Uncomment the lines below to export the results to test_cleaned_responses.csv
# test_df = pd.DataFrame(test_set)
# test_df.to_csv('test_cleaned_responses.csv', index=False)