In [4]:
import re

import numpy as np
import pandas as pd
from keras import Input
from keras.layers import Dense
from keras.models import Sequential
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

# Function to preprocess text
def preprocess_text(text):
    """Normalise ``text`` and split it into word tokens.

    The text is lowercased, every character that is neither an ASCII
    letter nor whitespace is deleted (not replaced by a space, so
    ``"don't"`` becomes ``"dont"``), and the remainder is split on
    whitespace.

    Args:
        text: The raw input string.

    Returns:
        A list of lowercase tokens; empty when nothing survives the filter.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return letters_only.split()

# Function to extract features
def extract_features(text, *, max_tfidf_features=10, n_topics=2):
    """Build a fixed-length numeric feature vector for a single document.

    The vector is laid out as::

        [word_count, unique_word_count, avg_word_length, sentiment_score,
         tfidf_0 .. tfidf_{max_tfidf_features-1},
         topic_0 .. topic_{n_topics-1}]

    so its length is always ``4 + max_tfidf_features + n_topics``.

    Args:
        text: The raw document text.
        max_tfidf_features: Upper bound on the TF-IDF vocabulary size and
            the width of the TF-IDF slice of the output. Documents with
            fewer distinct terms are zero-padded on the right.
        n_topics: Number of LDA components, and the width of the topic
            slice of the output.

    Returns:
        A 1-D ``numpy.ndarray`` of floats in the layout described above.
        When the vectorizer finds no usable term in ``text`` (for example an
        empty string), the TF-IDF and topic slices are all zeros.

    NOTE(review): the vectorizer and the LDA model are fitted on this one
    document only, so a given TF-IDF/topic column does not refer to the
    same term/topic across different documents. Fitting them once on a
    corpus would be needed for comparable features - confirm intent.
    """
    tokens = preprocess_text(text)

    # 1-3. Simple surface statistics over the cleaned tokens.
    word_count = len(tokens)
    unique_word_count = len(set(tokens))
    avg_word_length = np.mean([len(word) for word in tokens]) if tokens else 0

    # 5. Sentiment score (TextBlob polarity ranges from -1 to 1).
    sentiment_score = TextBlob(text).sentiment.polarity

    # 4 & 6. TF-IDF weights and LDA topic mixture. Both slices are
    # pre-filled with zeros so the output length never depends on how many
    # distinct terms the document happens to contain.
    tfidf_features = np.zeros(max_tfidf_features)
    lda_features = np.zeros(n_topics)

    tfidf_vectorizer = TfidfVectorizer(max_features=max_tfidf_features)
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError on an empty vocabulary (no token
        # matched); keep the zero-filled slices in that case.
        tfidf_matrix = None

    if tfidf_matrix is not None:
        weights = tfidf_matrix.toarray().flatten()
        tfidf_features[:weights.size] = weights

        # Reuse the matrix computed above instead of refitting the
        # vectorizer on the same text a second time.
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
        lda_features = lda.fit_transform(tfidf_matrix).flatten()

    # Combine features into a single vector
    combined_features = [
        word_count,
        unique_word_count,
        avg_word_length,
        sentiment_score,
        *tfidf_features,
        *lda_features,
    ]
    return np.array(combined_features)

# Sample long text
sample_text = """
Deep learning has transformed the field of artificial intelligence. Its applications include natural language processing,
image recognition, and game playing. CNNs and RNNs are widely used for tasks requiring spatial and sequential data processing.
"""

# Extract features from the text
features = extract_features(sample_text)

# Shape the single feature vector as a one-row batch (1, n_features), the
# 2-D layout the model below takes. This only reshapes; no scaling or
# normalisation is applied to the values.
features = features.reshape(1, -1)

# Deep Learning Model (Simple Example)
# The input width is declared with an explicit Input layer rather than the
# deprecated `input_dim` argument on the first Dense layer, which triggers a
# UserWarning in current Keras.
model = Sequential([
    Input(shape=(features.shape[1],)),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),  # Binary classification output
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print feature vector and model summary
print("Extracted Features:", features)
model.summary()



Extracted Features: [[34.         31.          6.17647059 -0.2         0.65465367  0.43643578
   0.21821789  0.21821789  0.21821789  0.21821789  0.21821789  0.21821789
   0.21821789  0.21821789  0.82499577  0.17500423]]


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
