In [1]:
# Import necessary libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fetch the complete 20 Newsgroups corpus, stripping headers, footers and quoted replies
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# Hold out 20% of the documents for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups.data,
    newsgroups.target,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: the vocabulary (capped at 5000 terms, English stop words removed)
# is fitted on the training split only and then reused to transform the test split
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit a Multinomial Naive Bayes model (fit() returns the estimator itself)
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out documents
y_pred = nb_classifier.predict(X_test_tfidf)

# Report accuracy, the per-class precision/recall/F1 table and the confusion matrix
evaluation = (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("\nClassification Report:\n", classification_report(y_test, y_pred)),
    ("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred)),
)
for heading, result in evaluation:
    print(heading, result)

Accuracy: 0.6835543766578249

Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.46      0.52       151
           1       0.58      0.63      0.61       202
           2       0.61      0.63      0.62       195
           3       0.54      0.69      0.61       183
           4       0.76      0.62      0.69       205
           5       0.81      0.79      0.80       215
           6       0.75      0.69      0.72       193
           7       0.70      0.68      0.69       196
           8       0.42      0.73      0.53       168
           9       0.83      0.79      0.81       211
          10       0.90      0.87      0.88       198
          11       0.77      0.75      0.76       201
          12       0.73      0.57      0.64       202
          13       0.81      0.80      0.81       194
          14       0.76      0.75      0.76       189
          15       0.53      0.86      0.65       202
          16       0.67    

In [3]:
# Function to read CoNLL 2003 dataset
def read_conll_data(file_path):
    """Read a CoNLL-2003 style file into a list of sentences.

    Every non-blank line is split on whitespace and its first four columns
    are taken as the word, POS tag, chunk tag and label. A blank (or
    whitespace-only) line ends the current sentence. Lines such as
    ``-DOCSTART-`` get no special treatment, so they show up as ordinary
    one-token sentences.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences. Each sentence is a list of dicts with the keys
        'word', 'pos', 'chunk' and 'label'. An empty file yields ``[]``.

    Raises:
        IndexError: If a non-blank line has fewer than four columns.
        OSError: If the file cannot be opened.
    """
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            tokens = line.split()
            if not tokens:
                # Blank line: sentence boundary. Consecutive blank lines must
                # not produce empty sentences.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if len(tokens) < 4:
                # Same exception type the bare indexing would raise, but with
                # enough context to locate the malformed line.
                raise IndexError(
                    f"{file_path}:{line_number}: expected at least 4 columns, "
                    f"got {len(tokens)}"
                )
            word, pos, chunk, label = tokens[0], tokens[1], tokens[2], tokens[3]
            sentence.append({'word': word, 'pos': pos, 'chunk': chunk, 'label': label})
    # Flush the final sentence: files that do not end with a blank line would
    # otherwise silently lose their last sentence.
    if sentence:
        sentences.append(sentence)
    return sentences

# Example usage
# NOTE(review): this is a machine-specific absolute path; point it at your own copy of the data.
conll_file_path = "C:\\Users\\hp\\Documents\\train.txt"  # Replace with the actual path to your CoNLL 2003 dataset file
dataset = read_conll_data(conll_file_path)

# Displaying the first few sentences.
# Slicing (rather than indexing 0..2) avoids an IndexError when the file
# contains fewer than three sentences.
for sentence in dataset[:3]:
    print(sentence)
    print()

[{'word': '-DOCSTART-', 'pos': '-X-', 'chunk': '-X-', 'label': 'O'}]

[{'word': 'EU', 'pos': 'NNP', 'chunk': 'B-NP', 'label': 'B-ORG'}, {'word': 'rejects', 'pos': 'VBZ', 'chunk': 'B-VP', 'label': 'O'}, {'word': 'German', 'pos': 'JJ', 'chunk': 'B-NP', 'label': 'B-MISC'}, {'word': 'call', 'pos': 'NN', 'chunk': 'I-NP', 'label': 'O'}, {'word': 'to', 'pos': 'TO', 'chunk': 'B-VP', 'label': 'O'}, {'word': 'boycott', 'pos': 'VB', 'chunk': 'I-VP', 'label': 'O'}, {'word': 'British', 'pos': 'JJ', 'chunk': 'B-NP', 'label': 'B-MISC'}, {'word': 'lamb', 'pos': 'NN', 'chunk': 'I-NP', 'label': 'O'}, {'word': '.', 'pos': '.', 'chunk': 'O', 'label': 'O'}]

[{'word': 'Peter', 'pos': 'NNP', 'chunk': 'B-NP', 'label': 'B-PER'}, {'word': 'Blackburn', 'pos': 'NNP', 'chunk': 'I-NP', 'label': 'I-PER'}]

