In [6]:
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix


## Sentence Classification Using Supervised Learning 

### Generate Dataset 

In [13]:
def remove_punctuation(text):
    punct_list = list(string.punctuation)
    for punc in punct_list:
        if punc in text:
            text = text.replace(punc, ' ')
    return text.strip()

# Sample data
# Categories and corresponding sentences
categories = {
    "programming": [
        "Python is a high-level programming language.",
        "Machine learning algorithms can be implemented in Python.",
        "Java is popular for enterprise software development.",
        "JavaScript is widely used for web development.",
        "Data science involves analyzing large datasets.",
        "Python is a versatile and beginner-friendly programming language used for web development, data analysis, and artificial intelligence.",
        "JavaScript is a scripting language commonly used for building interactive websites and web applications.",
        "Java is a widely-used object-oriented programming language, known for its platform independence and versatility.",
        "C++ is a powerful programming language used for system software, game development, and high-performance applications.",
        "HTML and CSS are essential languages for creating and styling web pages on the internet.",
        "PHP is a server-side scripting language used for web development and creating dynamic web pages.",
        "Ruby on Rails is a popular web application framework written in Ruby, emphasizing convention over configuration.",
        "SQL (Structured Query Language) is a language used for managing and querying relational databases.",
        "Swift is a programming language developed by Apple for building iOS, macOS, watchOS, and tvOS applications.",
        "Go, also known as Golang, is a statically typed language developed by Google, designed for simplicity and efficiency.",
        "TypeScript is a superset of JavaScript that adds static typing and other features to enhance code maintainability and scalability.",
        "Rust is a systems programming language focused on safety, speed, and concurrency, developed by Mozilla.",
        "Kotlin is a modern programming language that runs on the Java Virtual Machine (JVM), widely used for Android app development.",
        "PHP is a server-side scripting language used for web development and creating dynamic web pages.",
        "MATLAB is a high-level programming language and interactive environment for numerical computation and visualization.",
        "R is a programming language and environment commonly used for statistical analysis and data visualization.",
        "Shell scripting involves writing scripts to automate tasks in Unix-like operating systems using shell commands.",
        "Assembly language is a low-level programming language used for writing programs that interact directly with hardware.",
        "Lisp is a family of programming languages known for their unique syntax and powerful features, including support for symbolic computation.",
        "Dart is a programming language developed by Google, used for building web, mobile, and desktop applications using the Flutter framework."
    ],
    "sports": [
        "Basketball is an exciting sport to watch.",
        "Football requires strength and teamwork.",
        "Tennis players need agility and precision.",
        "Swimming is a great way to stay fit.",
        "Golf is a relaxing sport played outdoors."
        "Soccer, also known as football in many countries, is the most popular sport globally.",
        "Basketball is a fast-paced team sport played on a rectangular court, involving shooting and dribbling.",
        "Tennis is a racket sport played individually or in doubles, with players hitting a ball over a net.",
        "Swimming is a full-body exercise and a popular recreational activity, with various strokes like freestyle and butterfly.",
        "Cricket is a bat-and-ball game played between two teams, popular in countries like India, England, and Australia.",
        "Golf is a precision sport where players use clubs to hit balls into a series of holes on a course.",
        "Running includes various disciplines like sprinting, long-distance running, and marathon races.",
        "Baseball is a bat-and-ball game played between two teams, with players aiming to score runs by hitting the ball and running around bases.",
        "Gymnastics involves performing acrobatic feats and routines on apparatus like bars, beams, and vaults.",
        "Rugby is a physical contact sport similar to football, played with an oval ball and involving tackling and passing.",
        "Cycling includes road racing, track cycling, and mountain biking, with events like the Tour de France attracting global attention.",
        "Volleyball is a team sport played with a ball over a net, requiring coordination and teamwork.",
        "Martial arts encompass a variety of combat practices and disciplines, including karate, judo, and taekwondo.",
        "Ice hockey is a fast-paced sport played on ice, with players using sticks to hit a puck into the opponent's goal.",
        "Figure skating combines elements of dance, gymnastics, and precision, performed on ice skates.",
        "Surfing involves riding waves on a board, with locations like Hawaii and California renowned for their surf spots.",
        "Skiing includes downhill, cross-country, and freestyle disciplines, popular in mountainous regions during winter.",
        "Sailing is a water sport involving navigating sailboats across bodies of water, with competitions like the America's Cup showcasing elite sailing.",
        "Horse racing involves horses running at high speeds on tracks, with events like the Kentucky Derby capturing widespread interest.",
        "Boxing is a combat sport where two opponents fight using their fists, with matches divided into rounds and regulated by rules and referees."
    ],
    "animals": [
        "Cats are independent animals.",
        "Dogs are known for their loyalty to humans.",
        "Elephants are the largest land animals.",
        "Birds have feathers and lay eggs.",
        "Lions are apex predators in the wild.",
        "Tigers are majestic creatures found in various habitats across Asia.",
        "Dolphins are highly intelligent marine mammals known for their playful behavior.",
        "Elephants are the largest land animals and are revered in many cultures.",
        "Penguins are flightless birds that thrive in cold climates, often found in Antarctica.",
        "Koalas are iconic marsupials native to Australia, known for their eucalyptus diet.",
        "Giraffes have long necks that help them reach high leaves in trees on the African savannah.",
        "Octopuses are remarkable cephalopods with complex nervous systems and camouflage abilities.",
        "Polar bears are well-adapted to Arctic environments, relying on sea ice for hunting.",
        "Monarch butterflies undertake long migrations across North America to their wintering grounds.",
        "Chimpanzees are our closest living relatives, sharing approximately 98% of our DNA.",
        "Honeybees play a vital role in pollinating crops and maintaining ecosystems.",
        "Red pandas are adorable arboreal mammals native to the forests of the Himalayas.",
        "Orcas, also known as killer whales, are apex predators of the ocean, known for their social behavior.",
        "Blue whales are the largest animals ever known to have existed, found in oceans worldwide.",
        "Red foxes are adaptable carnivores found in diverse habitats, from forests to urban areas.",
        "African elephants form close-knit family groups led by matriarchs, with strong social bonds.",
        "Snow leopards are elusive big cats adapted to life in the mountainous regions of Central Asia.",
        "Hummingbirds are tiny birds capable of hovering in mid-air, known for their iridescent plumage.",
        "Giant pandas are beloved symbols of conservation, native to bamboo forests in China.",
        "Sea turtles are ancient reptiles that migrate long distances between feeding and nesting grounds."
    ],
    "economics": [
        "Supply and demand determine prices in a market economy.",
        "Inflation erodes the purchasing power of money.",
        "Unemployment can lead to social and economic problems.",
        "GDP measures the total value of goods and services produced in a country.",
        "Monetary policy influences interest rates and money supply.",
        "The global economy is interconnected, affecting nations across borders.",
        "Fiscal policy involves government decisions on taxation and spending.",
        "Economic growth is crucial for increasing standards of living.",
        "Trade agreements facilitate the exchange of goods and services between countries.",
        "The stock market reflects investor sentiment and economic performance.",
        "Central banks regulate monetary policy to control inflation and stabilize currencies.",
        "Income inequality is a significant challenge in many economies.",
        "Technological advancements drive productivity and innovation in the economy.",
        "Consumer spending is a key driver of economic activity.",
        "Tariffs and trade barriers can disrupt international trade flows.",
        "Economic recessions are periods of negative growth and high unemployment.",
        "Economic indicators such as GDP and unemployment rates provide insights into economic health.",
        "International aid programs aim to alleviate poverty and promote economic development.",
        "Economic globalization has led to increased interconnectedness but also vulnerabilities.",
        "The housing market plays a crucial role in the overall economy.",
        "Economic sanctions are used to influence the behavior of countries on the global stage.",
        "Economic forecasting helps businesses and policymakers make informed decisions.",
        "Sustainable development balances economic growth with environmental protection.",
        "The gig economy is reshaping traditional employment structures.",
        "Economic policy debates often center around issues like taxation, regulation, and government intervention."
    ]
}

# Create corpus in required format
corpus = []
for category, sentences in categories.items():
    for sentence in sentences:
        sentence = remove_punctuation(sentence)
        corpus.append((sentence, category))

print(corpus)

[('Python is a high level programming language', 'programming'), ('Machine learning algorithms can be implemented in Python', 'programming'), ('Java is popular for enterprise software development', 'programming'), ('JavaScript is widely used for web development', 'programming'), ('Data science involves analyzing large datasets', 'programming'), ('Python is a versatile and beginner friendly programming language used for web development  data analysis  and artificial intelligence', 'programming'), ('JavaScript is a scripting language commonly used for building interactive websites and web applications', 'programming'), ('Java is a widely used object oriented programming language  known for its platform independence and versatility', 'programming'), ('C   is a powerful programming language used for system software  game development  and high performance applications', 'programming'), ('HTML and CSS are essential languages for creating and styling web pages on the internet', 'programming')

### Vectorize Dataset Using TF-IDF

In [None]:
# Split data into features and labels
X, y = zip(*corpus)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(ngram_range=(1,2))
X = vectorizer.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=52)

### Train  Support Vector Machine (SVM) Classifier Model

In [14]:
# Train SVM classifier
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     animals       1.00      0.71      0.83         7
   economics       0.75      1.00      0.86         3
 programming       1.00      0.83      0.91         6
      sports       0.67      1.00      0.80         4

    accuracy                           0.85        20
   macro avg       0.85      0.89      0.85        20
weighted avg       0.90      0.85      0.85        20



In [15]:
# Collect misclassified indices
misclassified_indices = [i for i, (true_label, predicted_label) in enumerate(zip(y_test, y_pred)) if true_label != predicted_label]

# Print misclassified sentences
for index in misclassified_indices:
    print(f"Actual Label: {y_test[index]}, Predicted Label: {y_pred[index]}, Sentence: {X_test[index]}")

Actual Label: programming, Predicted Label: economics, Sentence:   (0, 54)	0.31169966131139576
  (0, 55)	0.31169966131139576
  (0, 370)	0.26771498510350167
  (0, 372)	0.31169966131139576
  (0, 375)	0.31169966131139576
  (0, 767)	0.23220374027194834
  (0, 768)	0.31169966131139576
  (0, 862)	0.31169966131139576
  (0, 863)	0.31169966131139576
  (0, 1260)	0.31169966131139576
  (0, 1261)	0.31169966131139576
Actual Label: animals, Predicted Label: sports, Sentence:   (0, 5)	0.17874253579854185
  (0, 9)	0.21973130024301618
  (0, 46)	0.20159348303219948
  (0, 48)	0.21973130024301618
  (0, 254)	0.21973130024301618
  (0, 255)	0.21973130024301618
  (0, 612)	0.20159348303219948
  (0, 901)	0.17874253579854185
  (0, 904)	0.21973130024301618
  (0, 956)	0.21973130024301618
  (0, 957)	0.21973130024301618
  (0, 962)	0.21973130024301618
  (0, 963)	0.21973130024301618
  (0, 993)	0.21973130024301618
  (0, 994)	0.21973130024301618
  (0, 1445)	0.14773573247618202
  (0, 1454)	0.21973130024301618
  (0, 1463)	0

### Improve Model Performance Using Voting Classifier

In [20]:

# Train individual models (e.g., SVM, Random Forest, Logistic Regression)
model1 = SVC(kernel='linear', probability=True)
model2 = RandomForestClassifier(n_estimators=100)
model3 = LogisticRegression()

# Create a voting classifier with averaging
voting_clf = VotingClassifier(estimators=[('svm', model1), ('rf', model2), ('lr', model3)], voting='soft')

# Train the voting classifier on the training data
voting_clf.fit(X_train, y_train)

# Make predictions using the voting classifier
y_pred_voting = voting_clf.predict(X_test)

print(classification_report(y_test, y_pred_voting))

              precision    recall  f1-score   support

     animals       1.00      1.00      1.00         7
   economics       0.75      1.00      0.86         3
 programming       1.00      0.83      0.91         6
      sports       1.00      1.00      1.00         4

    accuracy                           0.95        20
   macro avg       0.94      0.96      0.94        20
weighted avg       0.96      0.95      0.95        20



In [21]:
# Collect misclassified indices
misclassified_indices = [i for i, (true_label, predicted_label) in enumerate(zip(y_test, y_pred_voting)) if true_label != predicted_label]

# Print misclassified sentences
for index in misclassified_indices:
    print(f"Actual Label: {y_test[index]}, Predicted Label: {y_pred_voting[index]}, Sentence: {X_test[index]}")

Actual Label: programming, Predicted Label: economics, Sentence:   (0, 54)	0.31169966131139576
  (0, 55)	0.31169966131139576
  (0, 370)	0.26771498510350167
  (0, 372)	0.31169966131139576
  (0, 375)	0.31169966131139576
  (0, 767)	0.23220374027194834
  (0, 768)	0.31169966131139576
  (0, 862)	0.31169966131139576
  (0, 863)	0.31169966131139576
  (0, 1260)	0.31169966131139576
  (0, 1261)	0.31169966131139576
