In [1]:
{
 "cells": [
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# NewsBot Midterm Notebook - Team InsightAI\n",
    "# Complete Colab-ready Notebook integrating Modules 1-8\n",
    "\n",
    "# 1. Install and import libraries\n",
    "!pip install kaggle pandas numpy spacy nltk matplotlib scikit-learn\n",
    "!python -m spacy download en_core_web_sm\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.metrics import classification_report\n",
    "import spacy\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')\n",
    "nltk.download('vader_lexicon')\n",
    "\nnlp = spacy.load('en_core_web_sm')\n",
    "stop_words = set(stopwords.words('english'))"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# 2. Load dataset (replace with your file path)\n",
    "df = pd.read_csv('newsbot_dataset.csv')\n",
    "print(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# 3. Preprocessing\n",
    "def preprocess(text):\n",
    "    text = text.lower()\n",
    "    tokens = word_tokenize(text)\n",
    "    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]\n",
    "    doc = nlp(' '.join(tokens))\n",
    "    lemmas = [token.lemma_ for token in doc]\n",
    "    return ' '.join(lemmas)\n",
    "\ndf['clean_content'] = df['content'].apply(preprocess)"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# 4. TF-IDF Feature Extraction\n",
    "vectorizer = TfidfVectorizer(max_features=1000)\n",
    "X_tfidf = vectorizer.fit_transform(df['clean_content'])\n",
    "print(\"TF-IDF shape:\", X_tfidf.shape)"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# 5. POS Tagging\n",
    "df['pos_tags'] = df['clean_content'].apply(lambda x: [(token.text, token.pos_) for token in nlp(x)])"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# 6. Syntax Parsing\n",
    "df['dependency'] = df['clean_content'].apply(lambda x: [(token.text, token.dep_, token.head.text) for token in nlp(x)])"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# 7. Sentiment Analysis\n",
    "sia = SentimentIntensityAnalyzer()\n",
    "df['sentiment'] = df['clean_content'].apply(lambda x: sia.polarity_scores(x)['compound'])"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# 8. Multi-class Text Classification\n",
    "X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['category'], test_size=0.2, random_state=42)\n",
    "\n",
    "# SVM\n",
    "svm = SVC()\n",
    "svm.fit(X_train, y_train)\n",
    "y_pred_svm = svm.predict(X_test)\n",
    "print(\"SVM Results:\n\", classification_report(y_test, y_pred_svm))\n",
    "\n",
    "# Naive Bayes\n",
    "nb = MultinomialNB()\n",
    "nb.fit(X_train, y_train)\n",
    "y_pred_nb = nb.predict(X_test)\n",
    "print(\"Naive Bayes Results:\n\", classification_report(y_test, y_pred_nb))"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# 9. Named Entity Recognition (NER)\n",
    "df['entities'] = df['content'].apply(lambda x: [(ent.text, ent.label_) for ent in nlp(x).ents])"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# 10. Visualizations\n",
    "category_counts = df['category'].value_counts()\n",
    "plt.figure(figsize=(8,5))\n",
    "category_counts.plot(kind='bar')\n",
    "plt.title('Number of Articles per Category')\n",
    "plt.xlabel('Category')\n",
    "plt.ylabel('Count')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


{'cells': [{'cell_type': 'code',
   'metadata': {},
   'source': ['# NewsBot Midterm Notebook - Team InsightAI\n',
    '# Complete Colab-ready Notebook integrating Modules 1-8\n',
    '\n',
    '# 1. Install and import libraries\n',
    '!pip install kaggle pandas numpy spacy nltk matplotlib scikit-learn\n',
    '!python -m spacy download en_core_web_sm\n',
    '\n',
    'import pandas as pd\n',
    'import numpy as np\n',
    'import os\n',
    'import nltk\n',
    'from nltk.corpus import stopwords\n',
    'from nltk.tokenize import word_tokenize\n',
    'from nltk.sentiment.vader import SentimentIntensityAnalyzer\n',
    'from sklearn.feature_extraction.text import TfidfVectorizer\n',
    'from sklearn.model_selection import train_test_split\n',
    'from sklearn.svm import SVC\n',
    'from sklearn.naive_bayes import MultinomialNB\n',
    'from sklearn.metrics import classification_report\n',
    'import spacy\n',
    'import matplotlib.pyplot as plt\n',
    '\n',
    "nltk.downloa