In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('topics.csv', on_bad_lines='skip')

In [3]:
df.shape

(18775, 2)

In [4]:
df.head(10)

Unnamed: 0,text,label
0,"Y'all lighten up on Harry, Skip'll be like tha...",rec.sport.baseball
1,\nIt depends on the cause of the pneumonia. F...,sci.med
2,\nThere is a wonderful book by Jean Meeus call...,sci.space
3,"\n\n\nNo, you have completely misunderstood. ...",talk.politics.mideast
4,\nThat's not true. I gave you two examples....,alt.atheism
5,\nAbout a month ago there was a photo posted o...,sci.electronics
6,Thanks to all those people who recommended Wor...,comp.os.ms-windows.misc
7,\nYou are making precisely one of the points I...,sci.med
8,\n\tDoes anyone have details on this? What s...,comp.sys.mac.hardware
9,"While there are too many PS clones to count, s...",comp.os.ms-windows.misc


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18775 entries, 0 to 18774
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    18396 non-null  object
 1   label   18748 non-null  object
dtypes: object(2)
memory usage: 293.5+ KB


In [6]:
df=df.dropna(how='any')
df=df.drop_duplicates()

In [7]:
df['label'].value_counts()

label
soc.religion.christian                                                                                                                                                                                                                                                                                                                                                                                                                                 978
rec.sport.hockey                                                                                                                                                                                                                                                                                                                                                                                                                                       978
rec.motorcycles                                                                                             

In [8]:
df=df[df['label'].str.startswith(('soc.','rec.','comp.','sci.','misc.','talk.','alt.'))]

In [9]:
df['label'].value_counts()

label
rec.sport.hockey            978
soc.religion.christian      978
rec.motorcycles             971
comp.sys.ibm.pc.hardware    967
sci.crypt                   964
rec.sport.baseball          962
sci.med                     962
comp.windows.x              958
sci.space                   955
comp.graphics               954
sci.electronics             954
comp.os.ms-windows.misc     949
misc.forsale                945
rec.autos                   937
comp.sys.mac.hardware       931
talk.politics.mideast       911
talk.politics.guns          887
alt.atheism                 769
talk.politics.misc          711
talk.religion.misc          598
Name: count, dtype: int64

In [10]:
df.shape

(18241, 2)

In [11]:
df.nunique()

text     18176
label       20
dtype: int64

In [12]:
df=df.drop_duplicates(subset='text')

In [13]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
def clean_text(text):
    text=re.sub(r'(?im)^(from|date|message-id|sender|organization|expires|reply-to|distribution|keywords|newsgroups|references|lines)[:\-].*$','',text)
    text=re.sub(r'\S+@\S+',' ',text)
    text= re.sub(r'[^a-zA-Z]'," ",text)
    text=text.lower()
    text=re.sub(r'\s+'," ",text)
    text=' '.join([word for word in text.split() if word not in stop_words])

    return text
    

In [15]:
words_count_before_cleaning=df['text'].str.split().apply(len).sum()
print('Number of words before cleaning:',words_count_before_cleaning)
df['text']=df['text'].apply(clean_text)
df=df[df['text'].str.strip() != '']
df.to_csv('df.csv', index=False)
words_count_after_cleaning=df['text'].str.split().apply(len).sum()
print("Number of words after cleaning:",words_count_after_cleaning)

Number of words before cleaning: 3376036
Number of words after cleaning: 1931748


In [16]:
df.shape

(18127, 2)

#### Logistic regression

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [20]:
X_train, X_test, y_train, y_test=train_test_split(df['text'],df['label'],test_size=0.2,random_state=42,stratify=df['label'])

In [21]:
vectorizer=TfidfVectorizer()
X_train_vec=vectorizer.fit_transform(X_train)
X_test_vec=vectorizer.transform(X_test)

In [22]:
model=LogisticRegression(max_iter=2000)
model.fit(X_train_vec,y_train)

In [23]:
y_pred=model.predict(X_test_vec)
print(classification_report(y_test,y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.63      0.61      0.62       153
           comp.graphics       0.69      0.78      0.73       190
 comp.os.ms-windows.misc       0.77      0.68      0.72       189
comp.sys.ibm.pc.hardware       0.73      0.76      0.74       192
   comp.sys.mac.hardware       0.85      0.75      0.80       185
          comp.windows.x       0.84      0.84      0.84       191
            misc.forsale       0.85      0.82      0.84       188
               rec.autos       0.78      0.77      0.78       186
         rec.motorcycles       0.70      0.82      0.75       193
      rec.sport.baseball       0.88      0.88      0.88       190
        rec.sport.hockey       0.95      0.89      0.92       194
               sci.crypt       0.88      0.81      0.84       192
         sci.electronics       0.66      0.78      0.71       190
                 sci.med       0.80      0.90      0.84       191
         

#### ANN:

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

In [25]:
le= LabelEncoder()
df['encoded_label']=le.fit_transform(df['label'])

In [26]:
tokenizer= Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['text'])
sequences= tokenizer.texts_to_sequences(df['text'])
X=pad_sequences(sequences,maxlen=100)
y=to_categorical(df['encoded_label'])

In [27]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=42)
X_train, X_val, y_train, y_val= train_test_split(X_train,y_train,test_size=0.1,random_state=42)

In [28]:
model=Sequential()
model.add(Embedding(input_dim=10000,output_dim=128,input_length=100))
model.add(Flatten())
model.add(Dense(64,activation='relu'))
model.add(Dense(y.shape[1],activation='softmax'))
model.build(input_shape=(None, 100))



In [29]:
model.summary()

In [30]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [31]:
model.fit(
    X_train,y_train,
    validation_data=(X_val,y_val),
    epochs=10,
    batch_size=32,
    verbose=2)

Epoch 1/10
408/408 - 12s - 30ms/step - accuracy: 0.2827 - loss: 2.3071 - val_accuracy: 0.5879 - val_loss: 1.4310
Epoch 2/10
408/408 - 10s - 25ms/step - accuracy: 0.7921 - loss: 0.7518 - val_accuracy: 0.6099 - val_loss: 1.2804
Epoch 3/10
408/408 - 11s - 28ms/step - accuracy: 0.9485 - loss: 0.2293 - val_accuracy: 0.6216 - val_loss: 1.2885
Epoch 4/10
408/408 - 11s - 26ms/step - accuracy: 0.9849 - loss: 0.0877 - val_accuracy: 0.6237 - val_loss: 1.4010
Epoch 5/10
408/408 - 11s - 26ms/step - accuracy: 0.9920 - loss: 0.0430 - val_accuracy: 0.5996 - val_loss: 1.5216
Epoch 6/10
408/408 - 21s - 51ms/step - accuracy: 0.9960 - loss: 0.0253 - val_accuracy: 0.5899 - val_loss: 1.6487
Epoch 7/10
408/408 - 11s - 26ms/step - accuracy: 0.9972 - loss: 0.0175 - val_accuracy: 0.5810 - val_loss: 1.7648
Epoch 8/10
408/408 - 9s - 22ms/step - accuracy: 0.9972 - loss: 0.0132 - val_accuracy: 0.5713 - val_loss: 1.8360
Epoch 9/10
408/408 - 11s - 27ms/step - accuracy: 0.9979 - loss: 0.0114 - val_accuracy: 0.5596 - v

<keras.src.callbacks.history.History at 0x1b798685a00>

In [32]:
loss,accuracy=model.evaluate(X_test,y_test)
print(f'Test accuracy:{accuracy}')

[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5598 - loss: 2.0620
Test accuracy:0.5568119287490845
