In [None]:
import time
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

In [None]:
# Read the tweet export, then discard its first column
# (presumably a saved index column - TODO confirm against final.csv).
tweets = pd.read_csv('final.csv')
tweets = tweets.iloc[:, 1:]

# Series holding the raw tweet text used by the rest of the notebook.
tweet = tweets['Tweets']

In [None]:
# English stop words, minus 'not' so that it is NOT filtered out of the text.
stop_words = stopwords.words('english')
stop_words.remove('not')

# Every ASCII punctuation character as its own one-character token.
punct_words = [symbol for symbol in string.punctuation]

# De-duplicated union of both lists (kept as a list, order is arbitrary).
stop = list({*stop_words, *punct_words})

In [None]:
def get_easy_pos(tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    Only the leading letter of ``tag`` is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [None]:
# Single WordNetLemmatizer instance, created once and reused by process_text.
lemmatizer = WordNetLemmatizer()

In [None]:
def process_text(text):
    """Normalise one tweet into a space-joined string of lemmas.

    Steps: lower-case, drop ``http...`` URLs, strip ASCII punctuation,
    tokenize, POS-tag, drop stop words, lemmatize.

    Relies on the module-level ``stop`` collection, ``lemmatizer`` and
    ``get_easy_pos``.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Lemmatized tokens joined by single spaces; ``''`` when nothing is
        left after cleaning.
    """
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    # string.punctuation must be escaped before it goes inside [...]:
    # unescaped, its "\]" is read as an escaped bracket, so the backslash
    # itself was never part of the class and survived the clean-up.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    tokens = word_tokenize(text)
    if not tokens:
        # Nothing to tag; also avoids invoking the tagger for empty tweets.
        return ''
    # Tag the whole token sequence in one call, before stop words are
    # removed: the tagger sees each word with its neighbours, and is invoked
    # once per tweet instead of once per word.
    tagged_tokens = pos_tag(tokens)
    lemmas = [
        lemmatizer.lemmatize(token, get_easy_pos(tag))
        for token, tag in tagged_tokens
        if token not in stop
    ]
    return ' '.join(lemmas)

In [None]:
# Work on the first 500 tweets only.
tweet0 = tweet.iloc[:500]

# Clean every tweet in the sample with process_text.
processed_tweet0 = tweet0.apply(func=process_text)

In [None]:
def phase_1(lines, threshold=0.25):
    """Build a pairwise "similar enough" boolean matrix for a set of strings.

    Two lines are compared position by position: the number of indices at
    which both hold the same character is divided by the length of the
    longer line and rounded to two decimals. A pair is flagged similar only
    when that ratio is strictly greater than ``threshold``.

    Parameters
    ----------
    lines : iterable of str
        The strings to compare (list, tuple, pandas Series, ...). The input
        is materialised into a list, so a Series is read by position and its
        index labels do not matter.
    threshold : float, default 0.25
        Ratios less than or equal to this value mark the pair as not similar.

    Returns
    -------
    numpy.ndarray
        ``(n, n)`` boolean array, symmetric, with a True diagonal. Entry
        ``[i, j]`` is True when lines ``i`` and ``j`` are similar.
    """
    texts = list(lines)
    n = len(texts)
    # Use the builtin ``bool``: the ``np.bool`` alias no longer exists in
    # current NumPy releases and raises AttributeError.
    similar = np.ones((n, n), dtype=bool)
    for i in range(n):
        first = texts[i]
        for j in range(i + 1, n):
            second = texts[j]
            longest = max(len(first), len(second))
            if longest == 0:
                # Two empty lines are identical; keep the pair flagged as
                # similar instead of dividing by zero.
                continue
            # zip() stops at the shorter line, so only shared positions count.
            matches = sum(a == b for a, b in zip(first, second))
            if round(matches / longest, 2) <= threshold:
                similar[i, j] = similar[j, i] = False
    return similar

In [None]:
# Quick smoke run of phase_1 on three short, hand-written strings.
sample_lines = [
    'This is a test',
    'Another test line',
    'Something different',
]
result = phase_1(lines=sample_lines)

In [None]:
def phase_2(lines, lines_thresh_bool):
    """Pick one representative line for every group of similar lines.

    For each line ``i``, the lines flagged in row ``i`` of
    ``lines_thresh_bool`` are considered and the longest one is chosen
    (ties go to the lowest position). The distinct chosen positions form
    the result.

    Parameters
    ----------
    lines : iterable of str
        The lines, in the same order used to build ``lines_thresh_bool``.
        Read by position, so pandas index labels do not matter.
    lines_thresh_bool : array-like of bool, shape (n, n)
        Pairwise similarity flags.

    Returns
    -------
    refined_tweets : list of str
        The representative lines, ordered by position.
    indices : list of int
        Positions of those lines in ``lines``, sorted ascending so the
        output is reproducible (a bare ``list(set(...))`` has no defined
        order).

    Raises
    ------
    ValueError
        If ``lines_thresh_bool`` is not ``(len(lines), len(lines))``.
    """
    texts = list(lines)
    n = len(texts)
    if n == 0:
        return [], []

    similar = np.asarray(lines_thresh_bool, dtype=bool)
    if similar.shape != (n, n):
        raise ValueError(
            f'lines_thresh_bool must have shape ({n}, {n}), got {similar.shape}'
        )

    lengths = np.array([len(text) for text in texts])
    # Row i keeps the lengths of the lines similar to line i and -1 elsewhere;
    # argmax returns the first maximum, i.e. the longest similar line with
    # ties resolved towards the lowest position.
    candidate_lengths = np.where(similar, lengths, -1)
    best = candidate_lengths.argmax(axis=1)

    # A row without any True has no candidate at all; let that line
    # represent itself rather than failing.
    no_candidate = ~similar.any(axis=1)
    best[no_candidate] = np.flatnonzero(no_candidate)

    # np.unique de-duplicates and sorts; tolist() yields plain Python ints.
    indices = np.unique(best).tolist()
    refined_tweets = [texts[i] for i in indices]
    return refined_tweets, indices

In [None]:
# Phase 1: pairwise similarity flags for the processed sample.
lines_thresh_bool = phase_1(lines=processed_tweet0)

# Phase 2: representative tweets and their positions.
refined_tweets, l = phase_2(
    lines=processed_tweet0,
    lines_thresh_bool=lines_thresh_bool,
)

In [None]:
# Draw the pairwise similarity flags as a heatmap on a 10 x 10 figure.
plt.figure(figsize=(10, 10))
sns.heatmap(data=lines_thresh_bool)