In [None]:
import pickle
import pandas as pd
import itertools
from collections import Counter
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import word2vec
from sklearn.linear_model import LogisticRegression
import os
import string

In [None]:
# A function used to build a vocabulary based on descending word frequencies 
def build_vocab(sentences, num_skip=5):
    """Build a frequency-ranked vocabulary from tokenized sentences.

    Args:
        sentences: Iterable of token sequences.
        num_skip: How many of the most frequent tokens to leave out of the
            vocabulary. Defaults to 5, the value that used to be hard-coded.

    Returns:
        A tuple ``(word_counts, vocabulary, vocabulary_inv)`` where
        ``word_counts`` is a Counter over every token (skipped ones included),
        ``vocabulary_inv`` lists the kept tokens by descending count, and
        ``vocabulary`` maps each kept token to its position in
        ``vocabulary_inv``.

    Raises:
        ValueError: If ``num_skip`` is negative.
    """
    # A negative value would make the slice below keep only the tail of the
    # ranking instead of skipping the head, so reject it explicitly.
    if num_skip < 0:
        raise ValueError("num_skip must be non-negative, got {!r}".format(num_skip))
    word_counts = Counter(itertools.chain.from_iterable(sentences))
    # most_common() orders by descending count; ties keep first-seen order.
    ranked_words = [word for word, _ in word_counts.most_common()]
    # Mapping from index to word, without the num_skip most frequent tokens.
    vocabulary_inv = ranked_words[num_skip:]
    # Mapping from word to index.
    vocabulary = {word: index for index, word in enumerate(vocabulary_inv)}
    return word_counts, vocabulary, vocabulary_inv

In [None]:
# A function used to learn word embeddings through Word2vec module
def get_embeddings(inp_data, vocabulary_inv, size_features=100,
                   mode='skipgram',
                   min_word_count=2,
                   context=5):
    """Train Word2Vec on index-encoded sentences and return an embedding matrix.

    Args:
        inp_data: Iterable of sentences, each a sequence of integer indices
            into ``vocabulary_inv``.
        vocabulary_inv: Sequence mapping an index to its word.
        size_features: Dimensionality of the word vectors.
        mode: ``'skipgram'`` or ``'cbow'``.
        min_word_count: Passed to Word2Vec as ``min_count``.
        context: Passed to Word2Vec as ``window``.

    Returns:
        A numpy array of shape ``(len(vocabulary_inv), size_features)``.
        Row ``i`` holds the trained vector of ``vocabulary_inv[i]``; words the
        trained model has no vector for get a uniform random vector drawn
        from [-0.25, 0.25).

    Raises:
        ValueError: If ``mode`` is neither ``'skipgram'`` nor ``'cbow'``.
    """
    # Validate the mode before doing any work. An unrecognized value used to
    # leave `sg` unassigned and surface later as an UnboundLocalError.
    mode_settings = {
        'skipgram': (1, 'Model: skip-gram'),
        'cbow': (0, 'Model: CBOW'),
    }
    if mode not in mode_settings:
        raise ValueError(
            "mode must be 'skipgram' or 'cbow', got {!r}".format(mode))
    sg, mode_message = mode_settings[mode]

    model_name = "embedding"
    num_workers = 15  # Number of threads to run in parallel
    downsampling = 1e-3  # Downsample setting for frequent words
    print('Training Word2Vec model...')
    # use inp_data and vocabulary_inv to reconstruct sentences
    sentences = [[vocabulary_inv[w] for w in s] for s in inp_data]
    print(mode_message)
    embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                        sg=sg,
                                        vector_size=size_features,
                                        min_count=min_word_count,
                                        window=context,
                                        sample=downsampling)
    # NOTE(review): the message below says "Saving", but nothing in this
    # function writes the model to disk.
    print("Saving Word2Vec model {}".format(model_name))
    embedding_weights = np.zeros((len(vocabulary_inv), size_features))
    for i, word in enumerate(vocabulary_inv):
        if word in embedding_model.wv:
            embedding_weights[i] = embedding_model.wv[word]
        else:
            # No trained vector for this word: fall back to a small random
            # vector instead of leaving the row at zero.
            embedding_weights[i] = np.random.uniform(-0.25, 0.25,
                                                     embedding_model.vector_size)
    return embedding_weights

In [None]:
def preprocess_df(df):
    """Strip punctuation, stop words and one-character tokens from df["tweets"].

    Every ASCII punctuation character is replaced by a space, the text is
    split on whitespace, and tokens that are NLTK English stop words (plus
    'would') or exactly one character long are dropped. Matching is
    case-sensitive because the text is not lowercased. The surviving tokens
    are re-joined with single spaces.

    The "tweets" column of ``df`` is overwritten in place and the same
    DataFrame object is returned.
    """
    # English stop words from NLTK, extended with 'would'.
    stop_words = set(stopwords.words('english')) | {'would'}
    # Translation table mapping each punctuation character to a space.
    translator = str.maketrans({mark: ' ' for mark in string.punctuation})

    def clean(text):
        tokens = text.translate(translator).strip().split()
        kept = [tok for tok in tokens
                if not (tok in stop_words or len(tok) == 1)]
        return " ".join(kept)

    df["tweets"] = [clean(row["tweets"]) for _, row in df.iterrows()]
    return df

In [None]:
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Module-level stop list: NLTK's English stop words plus a few
# corpus-specific tokens. build_vocab in this cell reads it.
stop_words = {
    *stopwords.words('english'),
    'The', 'chatgpt', 'openAI', 'https', 'nhttps', 'co',
}

def build_vocab(data):
    """Count tokens and rank them by descending frequency.

    Tokens found in the module-level ``stop_words`` set are ignored, as are
    tokens that occur as a substring of ``string.punctuation`` (this covers
    every single punctuation character and the empty string).

    Note: this definition replaces the ``build_vocab`` defined earlier in
    the notebook once this cell has run.

    Args:
        data: Iterable of token sequences.

    Returns:
        A tuple ``(word_counts, vocabulary, vocabulary_inv)``: a dict of
        token counts, a dict mapping token -> rank, and the list of tokens
        ordered by descending count (ties keep first-seen order).
    """
    word_counts = {}
    for sentence in data:
        for word in sentence:
            if word in stop_words or word in string.punctuation:
                continue
            word_counts[word] = word_counts.get(word, 0) + 1

    # sorted() is stable, so equally frequent tokens stay in insertion order.
    vocabulary_inv = sorted(word_counts, key=word_counts.get, reverse=True)
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))

    return word_counts, vocabulary, vocabulary_inv

In [None]:
# Directory that holds the input CSV files.
data_path = "./"

# NOTE(review): both frames are read from the same file name, so the "test"
# frame is a copy of the training data — confirm this is intended.
df_train = pd.read_csv(f"{data_path}file.csv")
df_test = pd.read_csv(f"{data_path}file.csv")

# NOTE(review): this prepends the `labels` column (used later as the
# classification target) to the tweet text, with no separator. The features
# built from "tweets" therefore contain the label — this looks like label
# leakage; confirm it is intended.
for frame in (df_train, df_test):
    frame["tweets"] = frame["labels"] + "" + frame["tweets"]

# Clean the combined text of both frames.
df_train, df_test = preprocess_df(df_train), preprocess_df(df_test)


def preprocess_data(tagged_data):
    """Remove a fixed set of unwanted tokens from every tokenized text.

    Args:
        tagged_data: Iterable of token sequences.

    Returns:
        A list with one list per input text, holding the tokens that are
        not in the unwanted set, in their original order.
    """
    # NOTE(review): 'Chatgtp' may be a typo for another spelling — confirm.
    unwanted = ('Chatgtp', 'chatgpt', 'https', 're', 'nhttps', 's')
    cleaned = []
    for tokens in tagged_data:
        kept = []
        for token in tokens:
            if token in unwanted:
                continue
            kept.append(token)
        cleaned.append(kept)
    return cleaned



# tokenization (each frame is tokenized once)
tagged_train_data = [word_tokenize(text) for text in df_train["tweets"]]
tagged_test_data = [word_tokenize(text) for text in df_test["tweets"]]

# The vocabulary and the embeddings are learned from the training documents.
tagged_data = tagged_train_data
# build vocabulary from tokenized data
word_counts, vocabulary, vocabulary_inv = build_vocab(tagged_data)

# use the above mapping to create input data
# Tokens in excluded_tokens are dropped on purpose. The `word in vocabulary`
# guard is needed because build_vocab leaves some tokens out of the
# vocabulary, and indexing it directly would raise a KeyError for those.
excluded_tokens = {'chatgpt', 'Chatgtp', 'https', 're', 'nhttps', 's'}
inp_data = [
    [vocabulary[word] for word in text
     if word not in excluded_tokens and word in vocabulary]
    for text in tagged_data
]
# get embedding vector
embedding_weights = get_embeddings(inp_data, vocabulary_inv)


In [None]:
# Tokens that never contribute to a document vector.
excluded_doc_tokens = {'The', 'chatgpt', 'openAI', 'https', 'nhttps'}


def _mean_embedding(doc, embedding_weights, vocabulary, excluded):
    """Return the mean embedding of the usable tokens in ``doc``.

    A token is usable when it is not in ``excluded`` and has an entry in
    ``vocabulary``. The mean is taken over the usable tokens only. A document
    without any usable token yields a zero vector, so every document vector
    has the same length.
    """
    rows = [vocabulary[w] for w in doc if w not in excluded and w in vocabulary]
    if not rows:
        return np.zeros(embedding_weights.shape[1])
    return embedding_weights[rows].sum(axis=0) / len(rows)


# Train and test vectors are built by the same helper so both sets are
# normalized the same way. Previously the train loop divided by len(doc)
# while the test loop divided by the number of tokens actually summed, and
# only the test loop tolerated out-of-vocabulary tokens (via a bare except).
train_vec = [
    _mean_embedding(doc, embedding_weights, vocabulary, excluded_doc_tokens)
    for doc in tagged_train_data
]

test_vec = [
    _mean_embedding(doc, embedding_weights, vocabulary, excluded_doc_tokens)
    for doc in tagged_test_data
]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Single-step pipeline: the step is named 'classifier' so the grid below can
# address its hyper-parameters through the 'classifier__' prefix.
pipe = Pipeline(steps=[('classifier', LogisticRegression())])

# Search over the penalty type and the regularization strength C, using the
# liblinear solver.
param_grid = [
    dict(
        classifier=[LogisticRegression(max_iter=10000)],
        classifier__penalty=['l1', 'l2'],
        classifier__C=[0.001, 0.01, 0.1, 0.5, 1, 5, 10, 15, 20],
        classifier__solver=['liblinear'],
    )
]

# 10-fold cross-validated grid search using all available cores.
clf = GridSearchCV(estimator=pipe, param_grid=param_grid,
                   cv=10, verbose=1, n_jobs=-1)
best = clf.fit(train_vec, df_train["labels"])

In [None]:
# Evaluate the best model on the test data
from sklearn.metrics import classification_report

# Pipeline refitted with the best parameter combination found by the search.
best_model = clf.best_estimator_

# Predict labels for the test vectors.
y_pred = best_model.predict(test_vec)

# Per-class precision / recall / F1 against the test labels.
report = classification_report(df_test["labels"], y_pred)
print(report)

SVC model: the code below was run, but it did not produce any results.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Single-step pipeline: the step is named 'classifier' so the grid below can
# address its hyper-parameters through the 'classifier__' prefix.
pipe = Pipeline(steps=[('classifier', SVC())])

# Search over the kernel type and the regularization strength C.
param_grid = [
    dict(
        classifier=[SVC()],
        classifier__kernel=['linear', 'rbf'],
        classifier__C=[0.001, 0.01, 0.1, 0.5, 1, 5, 10, 15, 20],
    )
]

# 10-fold cross-validated grid search using all available cores.
clf = GridSearchCV(estimator=pipe, param_grid=param_grid,
                   cv=10, verbose=1, n_jobs=-1)
best = clf.fit(train_vec, df_train["labels"])


In [None]:
# Evaluate the performance of the model
from sklearn.metrics import classification_report

# Predict labels for the test vectors with the fitted grid search.
y_pred = clf.predict(test_vec)

# Per-class precision / recall / F1 against the test labels.
report = classification_report(df_test["labels"], y_pred)
print(report)

