In [24]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
from langdetect import detect
import string
import glob
import nltk
import math
import re
import os
# Fetch the NLTK data packages at import time. 'stopwords' backs the English
# stopword list read inside search(); 'wordnet' is presumably what
# WordNetLemmatizer needs at runtime; 'punkt' would serve word_tokenize, which is
# imported below but not called in the code visible here.
# Each call logs its status, including when the package is already up to date.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
def search(query):
    '''
    scrape tweets matching a query, classify and rank them, and write the ranking to search_result.csv

    Up to 1000 English tweets are collected. Each one is labelled Spam/Ham and
    assigned a news category by the pickled models, then scored by the cosine
    similarity between the query and the tweet text (log-tf, cosine-normalised
    documents against a log-tf * idf, cosine-normalised query) plus an engagement
    based weight. Rows are written without a header line in the column order:
    Date, User, Tweet, Url, Followers, Likes, Retweets, Category, Spam/Ham,
    weighted_score, Content_preprocessed, Scores, final_score, rank.

    :param query: the free-text search query (also used as the Twitter search string)
    '''
    # Local import: the exception type is only needed here and the file-level
    # import block only pulls in `detect`.
    from langdetect.lang_detect_exception import LangDetectException

    filename = 'search_result.csv'
    # Empty any previous result file up front so that a run which fails part-way
    # never leaves rows from an earlier query behind.
    if os.path.isfile(filename):
        with open(filename, 'w', newline='') as f:
            f.write('')

    # A set gives O(1) membership tests; the list was scanned once per token.
    nltk_stopwords = set(nltk.corpus.stopwords.words('english'))
    # Built once and shared by every preprocess() call instead of once per text.
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    def preprocess(text):
        '''
        turn raw text into a list of normalised tokens
        :param text: the raw text
        :return: lower-cased tokens with punctuation, digits, one-character tokens
                 and stopwords removed, lemmatised and then stemmed
        '''
        text = text.lower()
        text = "".join([char for char in text if char not in string.punctuation])
        text = re.sub('[0-9]+', '', text)
        text = text.strip()
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        tokens = re.split(r'\W+', text)
        # len(token) > 1 also discards the empty strings re.split can produce.
        tokens = [token for token in tokens if len(token) > 1 and token not in nltk_stopwords]
        return [stemmer.stem(lemmatizer.lemmatize(token)) for token in tokens]

    def calculate_idf(N: int, df: int) -> float:
        '''
        calculate the inverse document frequency (idf) of a term
        :param N: number of documents
        :param df: document frequency of a term (number of documents containing the term)
        :return: log10(N / df), or 0 when no document contains the term
        '''
        if df == 0:
            return 0
        # math.log10 is more accurate than math.log(x, 10).
        return math.log10(N / df)

    def get_document_frequency(term: str, documents: list) -> int:
        '''
        calculate the document frequency of a term in a list of documents
        :param term: the term
        :param documents: the list of documents (each a list of tokens)
        :return: number of documents that contain the term
        '''
        return sum(1 for document in documents if term in document)

    def log_term_frequency(freq: int) -> float:
        '''
        calculate the log term frequency of a term in a document
        :param freq: the term frequency
        :return: 1 + log10(freq), or 0 when freq is 0
        '''
        if freq == 0:
            return 0
        return 1 + math.log10(freq)

    def preprocess_documents(df):
        '''
        build one cosine-normalised log-tf vector per document
        :param df: frame whose 'Content_preprocessed' column holds token lists
        :return: list of {term: weight} dicts, in the same order as the rows of df
        '''
        document_vectors = []
        for document in df['Content_preprocessed']:
            # Raw term counts
            term_frequency = {}
            for term in document:
                term_frequency[term] = term_frequency.get(term, 0) + 1
            # Log term frequency (1.a); document frequency (1.b) is not part of
            # the document-side weighting scheme.
            for term in term_frequency:
                term_frequency[term] = log_term_frequency(term_frequency[term])
            # Cosine normalisation (1.c). Every weight is >= 1, so the norm is
            # only 0 for an empty document, in which case the loop body never runs.
            norm = math.sqrt(sum(weight ** 2 for weight in term_frequency.values()))
            for term in term_frequency:
                term_frequency[term] /= norm

            document_vectors.append(term_frequency)

        return document_vectors

    def preprocess_query(query: str) -> list:
        '''
        preprocess the query into a cosine-normalised log-tf * idf vector
        :param query: the query
        :return: {term: weight} dict; left un-normalised when all weights are 0
        '''
        # Raw term counts of the preprocessed query
        query_vector = {}
        for term in preprocess(query):
            query_vector[term] = query_vector.get(term, 0) + 1
        # Log term frequency (2.a) times inverse document frequency (2.b),
        # measured against the scraped tweets held in the enclosing `df`.
        for term in query_vector:
            idf = calculate_idf(len(df), get_document_frequency(term, df['Content_preprocessed']))
            query_vector[term] = log_term_frequency(query_vector[term]) * idf

        # Cosine normalisation (2.c); skipped for an all-zero vector to avoid
        # dividing by zero.
        norm = math.sqrt(sum(weight ** 2 for weight in query_vector.values()))
        if norm == 0:
            return query_vector
        for term in query_vector:
            query_vector[term] /= norm

        return query_vector

    def calculate_cosine_similarity(query_vector: dict, document_vector: dict) -> float:
        '''
        calculate the cosine similarity between a query and a document
        :param query_vector: the query vector
        :param document_vector: the document vector
        :return: dot product over the shared terms (both vectors are already normalised)
        '''
        return sum(
            weight * document_vector[term]
            for term, weight in query_vector.items()
            if term in document_vector
        )

    def get_top_k_documents(query: str, k: int) -> list:
        '''
        print the top k documents for a query
        :param query: the query
        :param k: number of documents to print
        :return: the similarity score of every document, in document order
        '''
        query_vector = preprocess_query(query)
        scores = [calculate_cosine_similarity(query_vector, document_vector) for document_vector in dc_vectors]
        print(scores)
        # Indices of the k best documents. The slice is shorter than k when fewer
        # documents exist, so iterate over it rather than over range(k).
        top_k_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
        for position, index in enumerate(top_k_indices, start=1):
            print('\n' + str(position) + ')\nTitle: ', df.iloc[index]['Tweet'])
            print('Score: ', scores[index], end="\n---------------XXX---------------")
        return scores

    # Class index -> category name for the predictions of the category model.
    v={35: 'U.S. NEWS', 5: 'COMEDY', 22: 'PARENTING', 40: 'WORLD NEWS', 7: 'CULTURE & ARTS', 32: 'TECH', 28: 'SPORTS', 10: 'ENTERTAINMENT', 24: 'POLITICS', 37: 'WEIRD NEWS', 11: 'ENVIRONMENT', 9: 'EDUCATION', 6: 'CRIME', 27: 'SCIENCE', 38: 'WELLNESS', 3: 'BUSINESS', 30: 'STYLE & BEAUTY', 13: 'FOOD & DRINK', 20: 'MEDIA', 25: 'QUEER VOICES', 17: 'HOME & LIVING', 39: 'WOMEN', 2: 'BLACK VOICES', 34: 'TRAVEL', 21: 'MONEY', 26: 'RELIGION', 19: 'LATINO VOICES', 18: 'IMPACT', 36: 'WEDDINGS', 4: 'COLLEGE', 23: 'PARENTS', 1: 'ARTS & CULTURE', 29: 'STYLE', 15: 'GREEN', 31: 'TASTE', 16: 'HEALTHY LIVING', 33: 'THE WORLDPOST', 14: 'GOOD NEWS', 41: 'WORLDPOST', 12: 'FIFTY', 0: 'ARTS', 8: 'DIVORCE'}
    limit = 1000
    tweets = []

    # SECURITY: pickle.load can execute arbitrary code while deserialising.
    # Only load model files that were produced locally or come from a trusted source.
    with open('classify_model.pkl', 'rb') as f:
        category_model = pickle.load(f)

    with open('spam_model.pkl', 'rb') as f:
        spam_model = pickle.load(f)

    with open('vectorizer.pkl', 'rb') as file:
        feature_extraction = pickle.load(file)

    for tweet in sntwitter.TwitterSearchScraper(query).get_items():
        if len(tweets) == limit:
            break

        # Read the text once; it is needed by every step below.
        content = tweet.content
        try:
            if detect(content) != 'en':
                continue
        except LangDetectException:
            # Raised when the text has no detectable language features;
            # skip that tweet instead of aborting the whole search.
            continue

        spam_prediction = spam_model.predict(feature_extraction.transform([content]))
        # The spam model's label 1 is reported as 'Ham', anything else as 'Spam'.
        spam_pre = 'Ham' if spam_prediction[0] == 1 else 'Spam'
        category = v[category_model.predict([content])[0]]
        # NOTE(review): the trailing /100 binds only to the followers term, and an
        # earlier note in this file listed followers=2, likes=4, retweets=3 while
        # the formula uses likes*2, retweets*3, followers*4 - confirm the intended
        # weighting before changing it.
        weight_score=(tweet.likeCount*2/1000)+(tweet.retweetCount*3/100)+(tweet.user.followersCount*4/10000)/100
        tweets.append([tweet.date, tweet.username, content, tweet.url, tweet.user.followersCount, tweet.likeCount, tweet.retweetCount, category, spam_pre, weight_score])

    df = pd.DataFrame(tweets, columns=['Date', 'User', 'Tweet','Url', 'Followers', 'Likes', 'Retweets','Category','Spam/Ham','weighted_score'])
    df['Content_preprocessed'] = df['Tweet'].apply(preprocess)
    dc_vectors = preprocess_documents(df)
    df['Scores'] = get_top_k_documents(query, len(df))
    # Final ranking = text similarity + engagement weight.
    df['final_score'] = df['Scores'] + df['weighted_score']
    df = df.sort_values(by='final_score', ascending=False)
    df['rank'] = range(1, len(df) + 1)
    # Appends to the file emptied at the top of the function; no header row is written.
    df.to_csv(filename, mode='a', index=False, header=False)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
# Run the full pipeline for a sample query: search() prints the similarity scores
# and the ranked tweets, and writes the ranking to search_result.csv.
search('ferrari is a  sports car manufacturer')

  if detect(tweet.content)=='en':
  spam_prediction = spam_model.predict(feature_extraction.transform([tweet.content]))
  category=v[category_model.predict([tweet.content])[0]]
  tweets.append([tweet.date, tweet.username, tweet.content,tweet.url, tweet.user.followersCount, tweet.likeCount, tweet.retweetCount,category,spam_pre,weight_score])
  tweets.append([tweet.date, tweet.username, tweet.content,tweet.url, tweet.user.followersCount, tweet.likeCount, tweet.retweetCount,category,spam_pre,weight_score])


[0.3036395942747475, 0.39326288098976997, 0.36080941107706227, 0.3862283991301262, 0.39372075140356544, 0.4758282994630517, 0.41191687726492054, 0.4263987851980787, 0.41191687726492054, 0.4396239399142827, 0.44288505932672606, 0.4122076444777083, 0.4122076444777083, 0.44288505932672606, 0.3342721514256357, 0.37633231115592247, 0.3364614172307871, 0.3364614172307871, 0.3868213912176146, 0.3337380038140885, 0.3833747783204413, 0.3415579488478029, 0.3342721514256357, 0.5048897386113761, 0.3982394326894463, 0.3364614172307871, 0.3364614172307871, 0.40649810969268974, 0.37633231115592247, 0.36080941107706227, 0.3576835122067935, 0.3576835122067935, 0.4396239399142827, 0.42069664344022084, 0.44288505932672606, 0.44288505932672606, 0.4244795637448086, 0.392901275149115, 0.5238111207475339, 0.3608094110770623, 0.44288505932672606, 0.44288505932672606, 0.3503563096986808, 0.4122076444777083, 0.35905052988323627, 0.392901275149115, 0.4031751701414, 0.3989269513128108, 0.46512025714440774, 0.4758