In [11]:
!pip install python-crfsuite

Collecting python-crfsuite
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.1 MB[0m [31m12.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite
Successfully installed python-crfsuite-0.9.10


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
df = pd.read_csv('news_train.csv')

In [3]:
df.head()

Unnamed: 0,headline,label,sentiment
0,LIC પોલિસી ધારકો માટે મોટી ખબર! પ્રીમિયમ જમા ક...,business,positive
1,"VIRAL VIDEO: મસ્તમૌલા અંદાજમાં રણવીર સિંહ, ચાર...",entertainment,positive
2,શાહરૂખની સાથે ફિલ્મમાં કામ કરી ચુક્યો છે અનુષ્...,entertainment,negative
3,"આજે અહીથી ખરીદો OnePlus 6, મળશે 25,000 રૂપિયા ...",tech,positive
4,સલમાન સાથે પંગા બાદ નથી મળતો 'સોરી' અને 'થેન્ક...,entertainment,negative


In [4]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,60
negative,40


In [5]:
# Function to remove HTML tags
def remove_html(text):
    soup = BeautifulSoup(text, 'lxml')
    html_free = soup.get_text()
    return html_free

In [6]:
def preprocess_text(text):
    # Remove HTML tags
    text = remove_html(text)
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Remove hashtags and @ symbols
    text = text.replace('@', '').replace('#', '')
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [7]:
# Function to generate N-grams
def generate_ngrams(text, n=1):
    words = word_tokenize(text)
    ngrams = [words[i:i+n] for i in range(len(words)-n+1)]
    return ngrams

In [8]:
gujarati_stopwords = ['અથવા', 'અને', 'અમને', 'અમારું', 'અમે', 'અહીં', 'આ', 'આગળ', 'આથી', 'આનું', 'આને', 'આપણને', 'આપણું', 'આપણે', 'આપી', 'આવી', 'આવે', 'ઉપર', 'ઊંચે', 'ઊભું', 'એ', 'એક', 'એના', 'એનાં', 'એની', 'એનું', 'એને', 'એનો', 'એમ', 'એવા', 'એવાં', 'એવી', 'એવું', 'એવો', 'ઓછું', 'અંગે', 'અંદર', 'કઈ', 'કયું', 'કયો, કરવું', 'કરતાં', 'કરી', 'કરીએ, કરું, કરે, કરેલું', 'કર્યા', 'કર્યાં', 'કર્યું', 'કર્યો', 'કંઈક', 'કાંઈ', 'કે', 'કેટલું', 'કેમ', 'કેવી', 'કેવું', 'કોઈ', 'કોઈક', 'કોણ', 'કોણે', 'કોને', 'ક્યારે', 'ક્યાં', 'ખૂબ', 'ગઈ','ગયા', 'ગયાં', 'ગયું', 'ગયો', 'ઘણું', 'છ', 'છતાં', 'છીએ', 'છું', 'છે', 'છેક', 'છો', 'જ', 'જાય', 'જી', 'જે' , 'જેટલું', 'જેને', 'જેમ' ,'જેવી',
'જેવું', 'જેવો', 'જો', 'જોઈએ', 'જ્યારે', 'જ્યાં', 'ઝાઝું', 'તને', 'તમને', 'તમારું', 'તમે', 'તારાથી', 'તારામાં','તારું', 'તું', 'તે', 'તેઓ', 'તેથી', 'તેણે', 'તેના', 'તેની', 'તેનું', 'તેને', 'તેમ', 'તેમનું', 'તેમને', 'તેવી', 'તેવું', 'તેં', 'તો', 'ત્યારે', 'ત્યાં', 'થઈ', 'થઈએ',  'થતા', 'થતાં', 'થતી', 'થતું', 'થતો', 'થયા', 'થયાં', 'થયું', 'થયો', 'થયેલું', 'થવું', 'થાઉં', 'થાઓ',  'થાય', 'થોડું', 'દરેક', 'ન', 'નથી', 'નહિ', 'નહીં', 'નં', 'ના', 'નીચે', 'ને', 'પછી', 'પણ', 'પર', 'પરંતુ', 'પહેલાં', 'પાછળ', 'પાસે', 'પોતાનું', 'પ્રત્યેક', 'ફક્ત', 'ફરી', 'ફરીથી', 'બધા','બધું', 'બની', 'બહાર', 'બહુ', 'બંને', 'બાદ', 'બે', 'મને', 'મા', 'માટે', 'માત્ર', 'મારું', 'મૂકવું', 'મૂકી', 'મૂક્યા', 'મૂક્યાં', 'મૂક્યું', 'મેં', 'રહી', 'રહે', 'રહેવું', 'રહ્યા', 'રહ્યાં','રહ્યો', 'રીતે', 'રૂ','લેતા','લેતું', 'લેવા','વગેરે', 'વધુ', 'શકે', 'શા', 'શું', 'સરખું', 'સામે', 'સુધી', 'હતા', 'હતાં', 'હતી', 'હતું', 'હશે', 'હશો', 'હવે', 'હા', 'હું', 'હો', 'હોઈ', 'હોઈશ', 'હોઈશું', 'હોય', 'હોવા',]

In [9]:
def process_ngrams(ngrams):
    # Flatten n-grams: list of lists/tuples to a single list
    flattened = [word[0] for word in ngrams]
    # Remove stopwords
    cleaned = [word for word in flattened if word not in gujarati_stopwords]
    return cleaned

In [14]:
# Function to perform POS tagging and extract adjectives, adverbs, and nouns
import posTagger as pt
tagger = pt.posTagger(corpus='prose')  # Use 'poetry' for poetry, 'prose' for prose
tagger.eval()  # Set the tagger in evaluation/inference mode

def extract_keywords(sentence):
    # Perform POS tagging
    pos_tags = tagger.pos_tag(sentence)

    # Define POS tags of interest based on the tagger's output
    pos_interested = ['N_NN', 'N_NNS', 'N_NNP', 'N_NNPS']  # Adapt based on actual tags used by your tagger

    # Extract keywords
    keywords = [word for word, pos in pos_tags if pos in pos_interested]

    return keywords

AttributeError: type object 'posTagger' has no attribute 'posTagger'

In [None]:
# Apply the preprocessing function to the headlines
df['headline'] = df['headline'].apply(preprocess_text)

# Generate N-grams (using unigrams here, can adjust n for bigrams, trigrams, etc.)
df['ngrams'] = df['headline'].apply(lambda x: generate_ngrams(x, 1))

# Flatten N-grams, remove stopwords
df['processed_ngrams'] = df['ngrams'].apply(process_ngrams)

# Extract keywords (adjectives, adverbs, and nouns)
df['keywords'] = df['processed_ngrams'].apply(lambda x: extract_keywords(x))
# print("keywords: ",df['keywords'])

# Prepare the data
x = df['keywords'].apply(lambda x: ' '.join(x)).tolist()
y = df['sentiment'].tolist()

AttributeError: 'function' object has no attribute 'pos_tag'

In [None]:
# Prepare the data
x = df['keywords'].apply(lambda x: ' '.join(x)).tolist()
y = df['sentiment'].tolist()

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('clf', LinearSVC()),
])

In [None]:
text_clf.fit(x_train, y_train)



In [None]:
# Save model
loaded_model = joblib.dump(text_clf, 'guju_model.pkl')

In [None]:
predictions = text_clf.predict(x_test)

In [None]:
from sklearn import metrics
accu = metrics.classification_report(y_test, predictions)
print(accu)

              precision    recall  f1-score   support

    negative       0.50      0.36      0.42        11
    positive       0.68      0.79      0.73        19

    accuracy                           0.63        30
   macro avg       0.59      0.58      0.58        30
weighted avg       0.62      0.63      0.62        30



In [None]:
simple_test = ["અજય દેવગણે કહ્યું- તે નારાજ હોય તો મને લાફો મારી શકે છે"]
pred1 = text_clf.predict(simple_test)
pred1

array(['negative'], dtype='<U8')