In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk import FreqDist

# Download the NLTK resources requested by this script before any text
# processing runs.
# NOTE(review): 'stopwords' is fetched (and imported above) but is not used
# by the code visible here — confirm whether it is still needed.
for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Load the raw training data from 'train.csv' in the working directory.
train = pd.read_csv('train.csv')

# Function to clean text
def clean_text(text):
    """Normalize a raw text value for the feature extractors.

    Steps, in order: lowercase, drop URLs, drop every character that is
    not an ASCII letter or whitespace (digits and punctuation included),
    then collapse whitespace runs to single spaces.

    Args:
        text: The raw text. Non-string values (for example the float NaN
            that pandas produces for an empty CSV cell) are treated as
            empty text.

    Returns:
        The cleaned string; ``''`` when ``text`` is not a string.
    """
    # A missing cell arrives as float NaN, which has no .lower(); map any
    # non-string value to the empty string instead of raising.
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Remove URLs first so they are dropped whole rather than being left
    # behind as letter fragments once punctuation is stripped.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Keep only ASCII letters and whitespace (this also removes digits).
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # split()/join trims both ends and collapses repeated whitespace.
    return ' '.join(text.split())

# Overwrite the raw 'text' column with its cleaned form, one row at a time.
train['text'] = train['text'].map(clean_text)

# Function to calculate positive and negative scores (simple example using lexicons)
def sentiment_scores(text):
    """Count lexicon hits among the whitespace-separated tokens of *text*.

    A token counts only when it equals a lexicon entry exactly, so the
    match is case-sensitive and ignores tokens with attached punctuation.

    Returns:
        A ``(positive_score, negative_score)`` tuple of integer counts.
    """
    positive_words = {'good', 'great', 'awesome', 'positive', 'happy'}
    negative_words = {'bad', 'sad', 'angry', 'negative', 'hate'}

    positive_score = 0
    negative_score = 0
    # One pass over the tokens, tallying both lexicons as we go.
    for token in text.split():
        if token in positive_words:
            positive_score += 1
        if token in negative_words:
            negative_score += 1

    return positive_score, negative_score

# Apply sentiment score calculation.
# Each row yields a (positive, negative) pair. The columns are built from
# those pairs directly because unpacking zip(*pairs) into two targets raises
# ValueError when the frame has no rows.
score_pairs = train['text'].apply(sentiment_scores)
train['positive_score'] = [positive for positive, _ in score_pairs]
train['negative_score'] = [negative for _, negative in score_pairs]

# Calculate text length (number of characters in each 'text' value)
train['text_length'] = train['text'].apply(len)

# Entity count (example)
# Let's count words with proper nouns (simplified entity extraction)
def entity_count(text):
    """Return how many tokens of *text* the POS tagger labels 'NNP'.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``; the count of 'NNP' (proper noun) tags stands in for an
    entity count.

    NOTE(review): only the exact tag 'NNP' is counted, so 'NNPS' tokens
    are excluded — confirm that is intended.
    """
    proper_nouns = [
        token
        for token, tag in pos_tag(word_tokenize(text))
        if tag == 'NNP'  # NNP -> Proper Noun
    ]
    return len(proper_nouns)

# NOTE(review): train['text'] has already been lowercased by clean_text at
# this point, while entity_count relies on 'NNP' tags. Presumably the tagger
# leans on capitalization, so these counts may be close to zero — confirm,
# and consider counting on the raw text instead.
train['entity_count'] = train['text'].map(entity_count)

# Part of Speech Ratios (Adjective, Verb, Noun)
def pos_ratios(text):
    """Return the share of adjective, verb and noun tokens in *text*.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``. Each part of speech is matched by tag prefix so that
    every variant is counted: 'JJ*' (JJ, JJR, JJS) for adjectives, 'VB*'
    for verbs and 'NN*' for nouns. Matching adjectives by prefix, like the
    other two classes, keeps comparatives and superlatives in the count.

    Returns:
        ``(adj_ratio, verb_ratio, noun_ratio)``, each the matching token
        count divided by the total token count; all zeros when the text
        produces no tokens.
    """
    tags = [tag for _, tag in pos_tag(word_tokenize(text))]

    total_tokens = len(tags)
    if total_tokens == 0:
        # Nothing to divide by: report every ratio as zero.
        return 0.0, 0.0, 0.0

    adj = sum(1 for tag in tags if tag.startswith('JJ'))   # Adjective
    verb = sum(1 for tag in tags if tag.startswith('VB'))  # Verb
    noun = sum(1 for tag in tags if tag.startswith('NN'))  # Noun

    return adj / total_tokens, verb / total_tokens, noun / total_tokens

# Each row yields an (adjective, verb, noun) ratio triple. The columns are
# built from those triples directly because unpacking zip(*triples) into
# three targets raises ValueError when the frame has no rows.
ratio_triples = train['text'].apply(pos_ratios)
train['adj_ratio'] = [adj for adj, _, _ in ratio_triples]
train['verb_ratio'] = [verb for _, verb, _ in ratio_triples]
train['noun_ratio'] = [noun for _, _, noun in ratio_triples]

# Negation flag (simple check for negations like 'not', 'never')
def negation_flag(text):
    """Return 1 if *text* contains a negation word, otherwise 0.

    The negation words are 'not', 'never' and 'no'. They are matched as
    whole words: a plain substring test would also fire on words that
    merely contain them, such as "know", "nothing", "note" or "another".
    The match is case-sensitive.

    Args:
        text: The text to inspect.

    Returns:
        ``1`` when any negation word occurs as a whole word, else ``0``.
    """
    negations = ('not', 'never', 'no')
    # \b anchors each alternative at word boundaries, so a negation next to
    # punctuation ("not.") still counts while an embedded one does not.
    pattern = r'\b(?:' + '|'.join(re.escape(word) for word in negations) + r')\b'
    return 1 if re.search(pattern, text) else 0

train['negation_flag'] = train['text'].map(negation_flag)

# Final preprocessed data: the 'text' and 'sentiment' columns followed by
# the engineered feature columns, in a fixed order.
feature_columns = [
    'text', 'sentiment', 'text_length', 'positive_score', 'negative_score',
    'entity_count', 'adj_ratio', 'verb_ratio', 'noun_ratio', 'negation_flag',
]
df = train[feature_columns]

# Show the first rows of the preprocessed data
print(df.head())
