In [243]:
import json
import pandas as pd

# Reading the JSON file containing metadata
json_file_path = "pavbhaji.json"

# Initializing lists to store text and labels
texts = []
labels = []

#extracted data with UTF-8 encoding
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)


In [244]:
data=pd.DataFrame(data)

In [245]:
# Splitting the 'display_url' column on the last '/' to match it to images in folder
data['display_url'] = data['display_url'].str.rsplit('/', 1).str[-1]


In [246]:
import os
#paths to the "PB"-Pav Bhaji and "NPB"-Not Pav Bhaji folders
pb_folder = "PB"  
npb_folder = "NPB"
# Creating a list of image file names in the "PB" and "NPB" folder
pb_images = os.listdir(pb_folder)
npb_images = os.listdir(npb_folder)
# Created two new columns for image namesin two folders
data['PB'] = ', '.join(pb_images)  
data['NPB'] = ', '.join(npb_images)  


In [247]:
# Defined a function to assign labels based on 'display_url'
def assign_label(row):
    if row['display_url'] in row['PB']:
        return 1  #If image is a pav bhaji it'll display 1 
    elif row['display_url'] in row['NPB']:
        return 0 
    else:
        return -1  #No match

# Next Created the 'label' column
data['label'] = data.apply(assign_label, axis=1)

data.head(2)


Unnamed: 0,dimensions,display_url,edge_liked_by,edge_media_preview_like,edge_media_to_caption,edge_media_to_comment,id,is_video,location,owner,...,tags,taken_at_timestamp,thumbnail_resources,thumbnail_src,urls,video_view_count,comments_disabled,PB,NPB,label
0,"{'height': 734, 'width': 640}",37599530_291077411445016_3382263118984904704_n...,{'count': 3797},{'count': 3797},{'edges': [{'node': {'text': 'TAG A PAV BHAJI ...,{'count': 52},1834712933156555738,True,,{'id': '1919686029'},...,"[vadapav, foodgram, foodphotography, foodblogg...",1532934873,"[{'config_height': 150, 'config_width': 150, '...",https://instagram.fpnq3-1.fna.fbcdn.net/vp/cb5...,[https://instagram.fpnq3-1.fna.fbcdn.net/vp/89...,0.0,,16228666_180901469054785_6854217108004274176_n...,24274012_2042365482666908_6941195371183865856_...,-1
1,"{'height': 750, 'width': 750}",36848355_187489742110497_7922878806031859712_n...,{'count': 12041},{'count': 12041},{'edges': [{'node': {'text': 'देसी स्टाइल पाव ...,{'count': 325},1826000656302706137,True,"{'has_public_page': True, 'id': '245717485', '...",{'id': '1445587278'},...,"[healthyfood, sokolkata, mumbaifoodie, faridab...",1531897016,"[{'config_height': 150, 'config_width': 150, '...",https://instagram.fpnq3-1.fna.fbcdn.net/vp/ad3...,[https://instagram.fpnq3-1.fna.fbcdn.net/vp/9c...,0.0,,16228666_180901469054785_6854217108004274176_n...,24274012_2042365482666908_6941195371183865856_...,-1


In [248]:
label_counts = data['label'].value_counts()
print("Label Value Counts:\n", label_counts)

Label Value Counts:
 -1    1048
 0     269
 1     183
Name: label, dtype: int64


In [249]:
#Took post description as a feature
df=data[['edge_media_to_caption','label']]

In [250]:
df.rename(columns={'edge_media_to_caption': 'description'}, inplace=True)
df.head(2)

Unnamed: 0,description,label
0,{'edges': [{'node': {'text': 'TAG A PAV BHAJI ...,-1
1,{'edges': [{'node': {'text': 'देसी स्टाइल पाव ...,-1


In [251]:
# Remove records with label -1 i.e. not a match with either folders
df = df[df['label'] != -1]
df

Unnamed: 0,description,label
17,{'edges': [{'node': {'text': 'Chicken Tikka 😍😋...,0
18,"{'edges': [{'node': {'text': 'Hello frandz,\np...",1
19,{'edges': [{'node': {'text': 'Follow @dillicio...,0
20,{'edges': [{'node': {'text': 'We’ve got you so...,0
21,{'edges': [{'node': {'text': 'People who love ...,0
...,...,...
503,{'edges': [{'node': {'text': 'Pav bhaji again ...,1
504,{'edges': [{'node': {'text': '#pavbhaji 😋 #lun...,1
505,{'edges': [{'node': {'text': 'Craving for some...,1
506,"{'edges': [{'node': {'text': 'Dear friends, \n...",0


In [252]:
label_counts = df['label'].value_counts()
print("Label Value Counts:\n", label_counts)

Label Value Counts:
 0    269
1    183
Name: label, dtype: int64


In [253]:
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings("ignore")
# Basic Preprocessing of text data-remove special characters, lowercase etc.
def preprocess_text(text):
    if isinstance(text, str):
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = text.lower()
    return text

# Converting all values in the 'description' column to strings to prevent error
df['description'] = df['description'].astype(str)

# Applying text preprocessing to the 'description' column
df['description'] = df['description'].apply(preprocess_text)
#Removing irrelevant text visible
df['description'] = df['description'].str.replace('edges', '').str.replace('node', '').str.replace('text', '')
df.head(2)

Unnamed: 0,description,label
17,chicken tikka nnnnfollow dillikiteekhimirch...,0
18,hello frandznpav bhaji khaalongaram hai nye...,1


In [264]:
X = df['description']
y = df['label']

# Splitting 70-30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Created TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# Transformed the test data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Applying BASELINE Logistic Regression model. We can improve the accuracy by implementing more complex models or techniquesa
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))


Accuracy: 0.60
              precision    recall  f1-score   support

           0       0.58      0.95      0.72        74
           1       0.75      0.19      0.31        62

    accuracy                           0.60       136
   macro avg       0.67      0.57      0.51       136
weighted avg       0.66      0.60      0.53       136



In [265]:
#################################   ManualTesting of the model with different examples    ##################################### 
def preprocess_text(text):
    if isinstance(text, str):
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = text.lower()
    return text

# Input your text data here whatever you want to test
input_text = "edges node text straight vadapao"

# Preprocessing
input_text = preprocess_text(input_text)
# Creating a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_vectorizer.fit(X_train)
# Transforming the input text using the trained vectorizer
input_text_tfidf = tfidf_vectorizer.transform([input_text])
# Using the trained model to predict the class
predicted_class = model.predict(input_text_tfidf)
# Mapping the predicted class label to a meaningful output
if predicted_class[0] == 1:
    prediction_result = "pavbhaji"
else:
    prediction_result = "not pavbhaji"
print(f"The input text is classified as: {prediction_result}")


The input text is classified as: not pavbhaji
