In [148]:
import pandas as pd

In [149]:
df=pd.read_csv('C:\\Users\\hinaa\\Downloads\\bbc_news.csv')

In [150]:
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [151]:
df.isnull().sum()

ArticleId    0
Text         0
Category     0
dtype: int64

In [152]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


# Text Preprocessing

# Lower Casing

In [153]:
df['Text'].str.lower()

0       worldcom ex-boss launches defence lawyers defe...
1       german business confidence slides german busin...
2       bbc poll indicates economic gloom citizens in ...
3       lifestyle  governs mobile choice  faster  bett...
4       enron bosses in $168m payout eighteen former e...
                              ...                        
1485    double eviction from big brother model caprice...
1486    dj double act revamp chart show dj duo jk and ...
1487    weak dollar hits reuters revenues at media gro...
1488    apple ipod family expands market apple has exp...
1489    santy worm makes unwelcome visit thousands of ...
Name: Text, Length: 1490, dtype: object

# Punctuation Removal

In [154]:
import string

def punct_removel(text):
    trans=str.maketrans('','',string.punctuation)
    return text.translate(trans)

In [155]:
df['Text']=df['Text'].apply(punct_removel)

In [156]:
df['Text']

0       worldcom exboss launches defence lawyers defen...
1       german business confidence slides german busin...
2       bbc poll indicates economic gloom citizens in ...
3       lifestyle  governs mobile choice  faster  bett...
4       enron bosses in 168m payout eighteen former en...
                              ...                        
1485    double eviction from big brother model caprice...
1486    dj double act revamp chart show dj duo jk and ...
1487    weak dollar hits reuters revenues at media gro...
1488    apple ipod family expands market apple has exp...
1489    santy worm makes unwelcome visit thousands of ...
Name: Text, Length: 1490, dtype: object

# Stopword

In [157]:
from nltk.corpus import stopwords

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [158]:
df['Text'] = df['Text'].apply(remove_stopwords)

In [159]:
df['Text']

0       worldcom exboss launches defence lawyers defen...
1       german business confidence slides german busin...
2       bbc poll indicates economic gloom citizens maj...
3       lifestyle governs mobile choice faster better ...
4       enron bosses 168m payout eighteen former enron...
                              ...                        
1485    double eviction big brother model caprice holb...
1486    dj double act revamp chart show dj duo jk joel...
1487    weak dollar hits reuters revenues media group ...
1488    apple ipod family expands market apple expande...
1489    santy worm makes unwelcome visit thousands web...
Name: Text, Length: 1490, dtype: object

# Tokenization

In [160]:
import nltk
from nltk.tokenize import word_tokenize

In [161]:
df['tokenized_text'] = df['Text'].apply(word_tokenize)

In [162]:
df['tokenized_text']

0       [worldcom, exboss, launches, defence, lawyers,...
1       [german, business, confidence, slides, german,...
2       [bbc, poll, indicates, economic, gloom, citize...
3       [lifestyle, governs, mobile, choice, faster, b...
4       [enron, bosses, 168m, payout, eighteen, former...
                              ...                        
1485    [double, eviction, big, brother, model, capric...
1486    [dj, double, act, revamp, chart, show, dj, duo...
1487    [weak, dollar, hits, reuters, revenues, media,...
1488    [apple, ipod, family, expands, market, apple, ...
1489    [santy, worm, makes, unwelcome, visit, thousan...
Name: tokenized_text, Length: 1490, dtype: object

# Stemming

In [163]:
from nltk.stem import PorterStemmer

In [164]:
def stem_tokens(tokens):
    ps = PorterStemmer()
    stemmed_words = [ps.stem(word) for word in tokens]
    return stemmed_words

In [165]:
stemmed_text = df['tokenized_text'].apply( stem_tokens)

In [166]:
stemmed_text 

0       [worldcom, exboss, launch, defenc, lawyer, def...
1       [german, busi, confid, slide, german, busi, co...
2       [bbc, poll, indic, econom, gloom, citizen, maj...
3       [lifestyl, govern, mobil, choic, faster, bette...
4       [enron, boss, 168m, payout, eighteen, former, ...
                              ...                        
1485    [doubl, evict, big, brother, model, capric, ho...
1486    [dj, doubl, act, revamp, chart, show, dj, duo,...
1487    [weak, dollar, hit, reuter, revenu, media, gro...
1488    [appl, ipod, famili, expand, market, appl, exp...
1489    [santi, worm, make, unwelcom, visit, thousand,...
Name: tokenized_text, Length: 1490, dtype: object

# Lemmantization

In [167]:
from nltk.stem import WordNetLemmatizer

In [168]:
lemmatizer = WordNetLemmatizer()

def lemmatize_stemmed_text(stemmed_words):
    if not isinstance(stemmed_words, list) or not all(isinstance(word, str) for word in stemmed_words):
        raise ValueError("Input must be a list of strings.")
    
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]
    return ' '.join(lemmatized_words)

In [169]:
# Apply the lemmatization function to each row
df['lemmatized_text'] = stemmed_text .apply(lemmatize_stemmed_text)

In [170]:
df['lemmatized_text']

0       worldcom exboss launch defenc lawyer defend fo...
1       german busi confid slide german busi confid fe...
2       bbc poll indic econom gloom citizen major nati...
3       lifestyl govern mobil choic faster better funk...
4       enron bos 168m payout eighteen former enron di...
                              ...                        
1485    doubl evict big brother model capric holbi cit...
1486    dj doubl act revamp chart show dj duo jk joel ...
1487    weak dollar hit reuter revenu medium group reu...
1488    appl ipod famili expand market appl expand ipo...
1489    santi worm make unwelcom visit thousand websit...
Name: lemmatized_text, Length: 1490, dtype: object

# Count Vectorizer

In [171]:
from sklearn.feature_extraction.text import CountVectorizer

In [172]:
# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the lemmatized text data
X = vectorizer.fit_transform(df['lemmatized_text'])

# Convert the sparse matrix to a DataFrame
vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Display the DataFrame
print(vectorized_df)

      00  000  0001  00051  000acr  000ayear  000bn  000m  000seater  \
0      0    0     0      0       0         0      0     0          0   
1      0    0     0      0       0         0      0     0          0   
2      0    1     0      0       0         0      0     0          0   
3      0    1     0      0       0         0      0     0          0   
4      0    0     0      0       0         0      0     0          0   
...   ..  ...   ...    ...     ...       ...    ...   ...        ...   
1485   0    1     0      0       0         0      0     0          0   
1486   0    0     0      0       0         0      0     0          0   
1487   0    0     0      0       0         0      0     0          0   
1488   0    0     0      0       0         0      0     0          0   
1489   0    1     0      0       0         0      0     0          0   

      000strong  ...  zombi  zone  zonealarm  zoom  zooropa  zorro  zuluaga  \
0             0  ...      0     0          0     0      

In [173]:
def create_ngram_models(text_data, ngram_range):
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    X = vectorizer.fit_transform(text_data)
    return X, vectorizer

# Unigrams
X_unigrams, vectorizer_unigrams = create_ngram_models(df['lemmatized_text'], (1, 1))

# Bigrams
X_bigrams, vectorizer_bigrams = create_ngram_models(df['lemmatized_text'], (2, 2))

# Trigrams
X_trigrams, vectorizer_trigrams = create_ngram_models(df['lemmatized_text'], (3, 3))


# Creating Model

In [182]:
y=df['Category']

In [183]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [184]:
from sklearn.naive_bayes import MultinomialNB

# Train Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

# Prediction

In [185]:
y_pred = model.predict(X_test)

In [189]:
from sklearn.metrics import accuracy_score

In [190]:
# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9765
