<a href="https://colab.research.google.com/github/kedarnathkedu/NLP_projects/blob/main/crime_Analysis_nlp_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# NLP Libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation

# Advanced NLP
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import spacy
from wordcloud import WordCloud
import torch


In [None]:

# Download the NLTK resources this notebook asks for.
# Each resource is attempted independently so that one failure does not skip
# the others, and the boolean result of `nltk.download` is checked because a
# failed download is normally signalled by a False return, not an exception.
NLTK_RESOURCES = ('punkt', 'stopwords', 'wordnet', 'vader_lexicon')

failed_nltk_resources = []
for resource_name in NLTK_RESOURCES:
    try:
        if not nltk.download(resource_name, quiet=True):
            failed_nltk_resources.append(resource_name)
    except Exception as exc:  # narrow enough to let KeyboardInterrupt through
        failed_nltk_resources.append(f"{resource_name} ({exc})")

if failed_nltk_resources:
    # Best-effort by design: later cells may still work from a cached copy.
    print("NLTK downloads may have failed - continuing anyway: "
          + ", ".join(failed_nltk_resources))

In [None]:
class CrimeNLPAnalyzer:
    """Container for the NLP resources used in the crime-text analysis.

    Construction builds the English stop-word set and a WordNet lemmatizer,
    then tries to load three optional models. A model that cannot be loaded
    is reported with a printed message and its attribute is left as ``None``,
    so callers must check for ``None`` before using it.
    """

    def __init__(self):
        # Requires the NLTK 'stopwords' corpus to be available locally.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

        # Optional models; each stays None when loading fails.
        self.sentiment_analyzer = None  # transformers sentiment pipeline
        self.hate_tokenizer = None      # tokenizer for the hate-speech model
        self.hate_model = None          # sequence-classification model
        self.nlp = None                 # spaCy pipeline used for NER

        # Initialize models
        self._initialize_models()

    def _initialize_models(self):
        """Load each model independently so one failure does not block the rest.

        Never raises for a load failure: the error is printed and the
        corresponding attribute keeps its ``None`` value.
        """
        # Sentiment Analysis
        try:
            self.sentiment_analyzer = pipeline(
                'sentiment-analysis',
                model="cardiffnlp/twitter-roberta-base-sentiment-latest")
        except Exception as e:
            print(f"Model initialization warning (sentiment): {e}")

        # Hate Speech Detection
        try:
            hate_tokenizer = AutoTokenizer.from_pretrained(
                "Hate-speech-CNERG/bert-base-uncased-hatexplain")
            hate_model = AutoModelForSequenceClassification.from_pretrained(
                "Hate-speech-CNERG/bert-base-uncased-hatexplain")
        except Exception as e:
            print(f"Model initialization warning (hate speech): {e}")
        else:
            # Assign as a pair so the tokenizer is never set without its model.
            self.hate_tokenizer = hate_tokenizer
            self.hate_model = hate_model

        # SpaCy NER
        try:
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            # spaCy raises OSError when the named package is not installed.
            print("SpaCy model not found. Install with: python -m spacy download en_core_web_sm")
        except Exception as e:
            print(f"Model initialization warning (spaCy): {e}")


In [None]:
# Locations of the three raw input files on the notebook runtime.
ncrb_path = '/content/NCRB%3A_Crime_Committed_by_Juveniles_(IPC_%26_SLL)_%E2%80%90_2018%E2%80%902022.csv'
cyber_path = '/content/NCRB_Table_9A.11.csv'
hate_path = '/content/Indo-HateSpeech_Dataset.xlsx'

# Two CSV tables and one Excel workbook, read in the same order as listed above.
df_ncrb, df_cyber = (pd.read_csv(csv_file) for csv_file in (ncrb_path, cyber_path))
df_hate = pd.read_excel(hate_path)

# Show the first three rows of each frame, one rich display per frame.
display(df_ncrb.head(3), df_cyber.head(3), df_hate.head(3))

Unnamed: 0,objectid,statename,id,lgd_statecode,censuscode2011,juv_crime_2018,juv_crime_2019,juv_crime_2020,juv_crime_2021,juv_crime_2022,child_pop2011,juv_crimerate_2022,st_areashape,st_lengthshape
0,1,Andaman and Nicobar Islands,35,35,35.0,25.0,19.0,20,11,24,1.1,21.4,7926498000.0,3627148.0
1,2,Andhra Pradesh,28,28,28.0,966.0,820.0,759,934,912,151.1,6.0,177286200000.0,5436285.0
2,3,Arunachal Pradesh,12,12,12.0,31.0,24.0,15,8,15,5.9,2.5,105791400000.0,3019452.0


Unnamed: 0,Sl. No.,State/UT,"Cyber Blackmailing/ Threatening/ Harassment (Sec.506,503, 384 IPC r/w IT Act",Fake Profile (IT Act r/w IPC/SLL),Cyber Pornography/ Hosting or Publishing Obscene Sexual Materials depicting children (Sec.67B of IT Act r/w other IPC/SLL),Cyber Stalking/ Bullying (Sec.354D IPC r/w IT Act),Internet Crimes through Online Games etc. (Sec.305 IPC r/w IT Act),Other Cyber Crimes against Children,Total Cyber Crimes against Children
0,1,Andhra Pradesh,0,0,99,13,0,12,124
1,2,Arunachal Pradesh,0,0,0,0,0,0,0
2,3,Assam,62,0,24,0,0,14,100


Unnamed: 0,Source.Name,Column1,Column2,Date,Likes,Comment,(view source),Post ID,Label
0,PostID 01 instagram-comments66804c78ca6bc-BxOs...,1.0,,2019-05-11 15:43:38,,Nice buro yes I am Hussain khan is me happy ow...,https://www.instagram.com/p/BxOsV8Gnauf/c/1786...,1.0,'HS0'
1,PostID 01 instagram-comments66804c78ca6bc-BxOs...,,1-1,2021-04-13 07:27:52,1.0,@hussainkhansadab teri maa chodate,https://www.instagram.com/p/BxOsV8Gnauf/c/1786...,1.0,'HS1'
2,PostID 01 instagram-comments66804c78ca6bc-BxOs...,,1-2,2021-07-04 17:04:03,2.0,@hussainkhansadab mohhamad and allah mi maa ki...,https://www.instagram.com/p/BxOsV8Gnauf/c/1786...,1.0,'HS1'



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [None]:

# Preview the first three rows of the hate-speech comments frame.
display(df_hate.iloc[:3])

Unnamed: 0,Source.Name,Column1,Column2,Date,Likes,Comment,(view source),Post ID,Label
0,PostID 01 instagram-comments66804c78ca6bc-BxOs...,1.0,,2019-05-11 15:43:38,,Nice buro yes I am Hussain khan is me happy ow...,https://www.instagram.com/p/BxOsV8Gnauf/c/1786...,1.0,'HS0'
1,PostID 01 instagram-comments66804c78ca6bc-BxOs...,,1-1,2021-04-13 07:27:52,1.0,@hussainkhansadab teri maa chodate,https://www.instagram.com/p/BxOsV8Gnauf/c/1786...,1.0,'HS1'
2,PostID 01 instagram-comments66804c78ca6bc-BxOs...,,1-2,2021-07-04 17:04:03,2.0,@hussainkhansadab mohhamad and allah mi maa ki...,https://www.instagram.com/p/BxOsV8Gnauf/c/1786...,1.0,'HS1'


In [None]:
# Drop rows that are entirely empty, fill the remaining gaps with 0, and
# normalise column names to lower case with underscores instead of spaces.
df_ncrb = (
    df_ncrb
    .dropna(how='all')
    .fillna(0)
    .rename(columns=lambda column_name: column_name.lower().replace(' ', '_'))
)

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess(text):
    """Normalise one piece of text into a space-joined string of lemmas.

    Steps: lower-case, delete every character that is not an ASCII letter or
    whitespace, tokenize, drop English stop words, lemmatize each token.

    Args:
        text: The text to clean. Missing values (None / NaN / NaT) yield "".
            Other non-string scalars (e.g. numeric spreadsheet cells) are
            converted with ``str`` instead of raising AttributeError.

    Returns:
        The cleaned tokens joined by single spaces; "" when nothing remains.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)
    text = text.lower()
    # Characters are removed, not replaced by a space, so "a,b" becomes "ab".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = [lemmatizer.lemmatize(w) for w in word_tokenize(text) if w not in stop_words]
    return ' '.join(tokens)



In [None]:
# TF-IDF features (unigrams and bigrams, top 1000 terms) over the comments.
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))

# The vectorizer rejects NaN documents ("np.nan is an invalid document"), so
# missing comments become empty strings. Filling instead of dropping keeps
# one row of X_tfidf per row of df_hate, in the same order.
comment_docs = df_hate['Comment'].fillna('').astype(str)
X_tfidf = tfidf_vectorizer.fit_transform(comment_docs)


ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
# spaCy pipeline used for named-entity recognition.
nlp = spacy.load('en_core_web_sm')

# Pin the checkpoint and revision explicitly. Calling pipeline() with only a
# task name falls back to a library-chosen default and emits a warning; the
# values below are the ones that default resolved to in this notebook's run.
sentiment_analyzer = pipeline(
    'sentiment-analysis',
    model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
    revision='714eb0f',
)

# NER example (with `text` bound to a single string):
# doc = nlp(text)
# [(ent.text, ent.label_) for ent in doc.ents]

# Sentiment example:
# sentiment_analyzer(text)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [None]:
# Hate Speech Model (Hugging Face)
htokenizer = AutoTokenizer.from_pretrained("Hate-speech-CNERG/bert-base-uncased-hatexplain")
hmodel = AutoModelForSequenceClassification.from_pretrained("Hate-speech-CNERG/bert-base-uncased-hatexplain")
import torch

# Maps the checkpoint's own label text to the names this notebook reports.
_HATE_LABEL_DISPLAY = {
    'normal': 'Normal',
    'offensive': 'Offensive',
    'hate speech': 'Hate',
}


def detect_hate_speech(text):
    """Classify a single text with the HateXplain BERT checkpoint.

    Args:
        text: One string to classify. Input longer than 512 tokens is
            truncated by the tokenizer.

    Returns:
        A ``(label, probability)`` tuple: the predicted class name and the
        softmax probability of that class as a Python float.
    """
    inputs = htokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = hmodel(**inputs).logits
    # logits has a leading batch axis of size 1; take row 0 so that `idx`
    # indexes classes. Indexing the 2-D tensor with a class id would address
    # the batch axis instead and fail.
    proba = torch.nn.functional.softmax(logits, dim=-1)[0]
    idx = int(proba.argmax())
    # Read the class name from the checkpoint config rather than assuming an
    # index order. NOTE(review): this checkpoint's id2label is believed to be
    # 0=hate speech, 1=normal, 2=offensive - confirm against its config.json.
    raw_label = str(hmodel.config.id2label[idx])
    label = _HATE_LABEL_DISPLAY.get(raw_label.lower(), raw_label)
    return label, proba[idx].item()

tokenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# Visualizations (WordCloud, Plotly/Seaborn)
# Example: WordCloud
# Statements start at column 0: a stray leading space on top-level code makes
# the whole cell fail with "IndentationError: unexpected indent".
# NOTE(review): no cell shown here creates df_cyber['processed_text']; confirm
# that column is built before this cell runs, otherwise this raises KeyError.
text_data = ' '.join(df_cyber['processed_text'].dropna().astype(str).values)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text_data)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

IndentationError: unexpected indent (ipython-input-10805460.py, line 3)