# TensorFlow README Processor


In [None]:
!pip install tensorflow pandas numpy matplotlib scikit-learn requests beautifulsoup4 tqdm

import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm

print(f'TensorFlow version: {tf.__version__}')

## 2. Data Collection from GitHub

We'll create a `GitHubDataCollector` class that handles GitHub API requests (including rate limiting) and collects README files from popular repositories written in our target languages.

In [None]:
import requests
import base64
import time
from typing import List, Dict, Any

class GitHubDataCollector:
    """Collect popular repositories and their README files from the GitHub REST API.

    The collector tracks the rate-limit headers of the most recent response and
    sleeps until the reset time once the remaining budget is exhausted. All
    network and decoding errors are reported with ``print`` and never raised,
    so a long collection run is not aborted by a single bad repository.
    """

    def __init__(self, token: str = None, timeout: float = 10.0):
        """Create a collector.

        Args:
            token: Optional GitHub token; when given it is sent in the
                ``Authorization`` header of every request.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.headers = {}
        if token:
            self.headers['Authorization'] = f'token {token}'
        self.base_url = 'https://api.github.com'
        self.timeout = timeout
        # Optimistic starting budget; replaced by the real values as soon as
        # the first response carrying rate-limit headers arrives.
        self.rate_limit_remaining = 5000
        self.rate_limit_reset = 0

    def _check_rate_limit(self):
        """Sleep until the rate-limit window resets if the budget is used up."""
        if self.rate_limit_remaining <= 1:
            wait_time = max(0, self.rate_limit_reset - time.time())
            if wait_time > 0:
                print(f'Rate limit reached. Waiting {wait_time:.0f} seconds...')
                time.sleep(wait_time)

    def _update_rate_limit(self, response: 'requests.Response'):
        """Update rate-limit state from the response headers.

        A header that is missing or not an integer leaves the previously
        known value untouched instead of resetting it to zero.
        """
        tracked = (
            ('rate_limit_remaining', 'X-RateLimit-Remaining'),
            ('rate_limit_reset', 'X-RateLimit-Reset'),
        )
        for attr, header in tracked:
            raw = response.headers.get(header)
            if raw is None:
                continue
            try:
                setattr(self, attr, int(raw))
            except ValueError:
                continue

    def get_popular_repos(self, languages: List[str], stars: int = 1000) -> List[Dict[str, Any]]:
        """Return up to 100 of the most-starred repositories per language.

        Args:
            languages: Language names used in the ``language:`` search qualifier.
            stars: Only repositories with more than this many stars are returned.

        Returns:
            The concatenated ``items`` of every successful search response.
            Languages whose request fails are reported and skipped.
        """
        repos = []
        url = f'{self.base_url}/search/repositories'
        for lang in tqdm(languages, desc='Fetching repositories'):
            try:
                self._check_rate_limit()
                # Passing the query via `params` lets requests percent-encode it,
                # so names such as "c++" or "c#" reach the API intact.
                params = {
                    'q': f'language:{lang} stars:>{stars}',
                    'sort': 'stars',
                    'per_page': 100,
                }
                response = requests.get(
                    url,
                    headers=self.headers,
                    params=params,
                    timeout=self.timeout,
                )
                # Read the headers before raising so that a rate-limit error
                # response still updates the remaining budget and reset time.
                self._update_rate_limit(response)
                response.raise_for_status()

                data = response.json()
                repos.extend(data.get('items', []))
            except requests.exceptions.RequestException as e:
                print(f'Error fetching {lang} repositories: {e}')
            except ValueError as e:
                # Body was not valid JSON (older requests versions raise ValueError).
                print(f'Error fetching {lang} repositories: {e}')
        return repos

    def get_readme(self, owner: str, repo: str) -> 'str | None':
        """Fetch and decode the README of ``owner/repo``.

        Returns:
            The README text decoded as UTF-8, or ``None`` when the request
            fails or the payload cannot be decoded.
        """
        try:
            self._check_rate_limit()
            url = f'{self.base_url}/repos/{owner}/{repo}/readme'
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            # Update before raising so error responses also refresh the budget.
            self._update_rate_limit(response)
            response.raise_for_status()

            content = response.json()['content']
            return base64.b64decode(content).decode('utf-8')
        except requests.exceptions.RequestException as e:
            print(f'Error fetching README for {owner}/{repo}: {e}')
        except (KeyError, TypeError, ValueError) as e:
            # ValueError covers invalid JSON, invalid base64 (binascii.Error)
            # and UnicodeDecodeError; TypeError covers a non-string payload.
            print(f'Error processing README for {owner}/{repo}: {e}')
        return None

## Executing the Data Collection


In [None]:
# Initialize collector
collector = GitHubDataCollector()  # Add token='your_token' for authenticated requests

# Define target languages and minimum stars
languages = ['python', 'javascript', 'java', 'go', 'rust']
min_stars = 5000

# Collect repositories
print(f'Collecting repositories with at least {min_stars} stars...')
repos = collector.get_popular_repos(languages, stars=min_stars)
print(f'Found {len(repos)} repositories')

# Collect READMEs and create dataset
data = []
for repo in tqdm(repos, desc='Collecting READMEs'):
    try:
        readme = collector.get_readme(repo['owner']['login'], repo['name'])
        if readme:
            data.append({
                'name': repo['name'],
                'language': repo['language'],
                'stars': repo['stargazers_count'],
                'text': readme
            })
    except (KeyError, TypeError) as e:
        # Only a malformed search item can fail here (get_readme reports its own
        # errors); use .get so the handler cannot raise a second KeyError.
        print(f'Error processing {repo.get("name", "<unknown>")}: {e}')

# Create DataFrame. Naming the columns explicitly keeps the frame usable
# (df['language'], df['text'], ...) even when no README could be collected.
df = pd.DataFrame(data, columns=['name', 'language', 'stars', 'text'])

# Display dataset statistics
print('\nDataset Statistics:')
print(f'Total samples: {len(df)}')
print('\nSamples per language:')
print(df['language'].value_counts())

## Text Preprocessing

In [None]:
import re
from bs4 import BeautifulSoup

def preprocess_readme(text: str) -> str:
    """Reduce a raw README to lower-case, whitespace-separated words.

    Fenced code blocks, markdown link/image targets, bare URLs, HTML tags and
    punctuation are removed, in that order.

    Args:
        text: Raw README content; ``None`` or an empty string is accepted.

    Returns:
        The cleaned text, or ``''`` when ``text`` is empty or ``None``.
    """
    if not text:
        return ''

    # Remove fenced code blocks. Non-greedy matching with DOTALL also removes
    # blocks that contain a backtick; replacing with a space keeps the words
    # on either side of the block from being merged.
    text = re.sub(r'```.*?```', ' ', text, flags=re.DOTALL)

    # Replace markdown links and images by their label. This runs before URL
    # stripping, because the URL pattern is greedy up to the next whitespace
    # and would otherwise swallow ")[next label](" between adjacent links.
    text = re.sub(r'!?\[([^\[\]]*)\]\([^)]*\)', r'\1', text)

    # Remove remaining bare URLs
    text = re.sub(r'https?://\S+', ' ', text)

    # Remove HTML tags; the separator keeps text from adjacent tags apart
    text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')

    # Remove special characters and collapse runs of whitespace
    text = re.sub(r'[^\w\s]', ' ', text)
    text = ' '.join(text.split())

    return text.lower()

# Preprocess all README texts
print('Preprocessing README texts...')
df['processed_text'] = df['text'].apply(preprocess_readme)

# Display example of preprocessed text. Guarded because .iloc[0] raises
# IndexError when the dataset is empty.
if len(df) > 0:
    print('Example of preprocessed text:')
    print('Original:', df['text'].iloc[0][:200], '...')
    print('Preprocessed:', df['processed_text'].iloc[0][:200], '...')
else:
    print('No README texts available to preview.')

## Text Vectorization with TensorFlow

In [None]:
# Create the text vectorization layer
max_features = 10000  # Maximum number of words to keep
sequence_length = 500  # Length of each sequence

vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length
)

# Create training and validation sets
X = df['processed_text'].values
# One-hot encode the labels. An explicit float dtype is requested because
# recent pandas versions return boolean dummy columns by default.
y = pd.get_dummies(df['language'], dtype='float32').values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'Training samples: {len(X_train)}')
print(f'Validation samples: {len(X_val)}')

# Adapt the layer to the training text only, so the vocabulary is not built
# from validation samples.
print('Adapting vectorization layer to the text data...')
vectorize_layer.adapt(X_train)

## Building the Deep Learning Model

In [None]:
# The output layer must have one unit per one-hot label column. Taking the
# width from y_train keeps the model consistent with the labels even when a
# requested language produced no samples.
num_classes = y_train.shape[1]

# Define the model
model = tf.keras.Sequential([
    vectorize_layer,
    tf.keras.layers.Embedding(max_features + 1, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Model summary
model.summary()

print('Model Architecture Explanation:')
print('1. TextVectorization: Converts text to sequences of integer tokens')
print('2. Embedding: Converts tokens to dense vectors of size 64')
print('3. Bidirectional LSTM: Processes sequences in both directions')
print('4. Dense + Dropout: Final classification layers with regularization')
print('5. Output: Probability distribution over programming languages')


## Training the Classifier

In [None]:
# Training hyperparameters
epochs = 20
batch_size = 16

# Stop once val_loss has not improved for 6 epochs and restore the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=6, restore_best_weights=True
)
# Keep on disk only the model with the best val_accuracy seen so far
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'best_model.keras', monitor='val_accuracy', save_best_only=True
)
# Write TensorBoard logs, including histograms every epoch
tensorboard = tf.keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1)

callbacks = [early_stopping, checkpoint, tensorboard]

# Fit on the training split, validating on the held-out split after each epoch
print('Training the model...')
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks,
)


## Model Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Class names in the order of the one-hot label columns. The labels were built
# with pd.get_dummies(df['language']), whose columns follow the sorted values of
# df['language'], not the order (or casing) of the `languages` query list.
class_names = list(pd.get_dummies(df['language']).columns)
label_ids = list(range(len(class_names)))

# Get predictions
y_pred = model.predict(X_val)
y_pred_classes = np.argmax(y_pred, axis=1)
y_val_classes = np.argmax(y_val, axis=1)

# Print classification report. Passing `labels` keeps every class in the
# report even if it does not occur in the validation split.
print('Classification Report:')
print(classification_report(
    y_val_classes,
    y_pred_classes,
    labels=label_ids,
    target_names=class_names,
    zero_division=0
))

# Plot confusion matrix
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_val_classes, y_pred_classes, labels=label_ids)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Calculate per-class accuracy; classes without validation samples yield NaN
# instead of triggering a division by zero.
support = cm.sum(axis=1)
class_accuracy = np.divide(
    cm.diagonal(),
    support,
    out=np.full(len(support), np.nan),
    where=support > 0
)
print('Per-class Accuracy:')
for lang, acc in zip(class_names, class_accuracy):
    print(f'{lang}: {acc:.2%}')


## Testing with Real Examples

In [None]:
def predict_language(text, classifier=None, class_names=None):
    """Predict the programming language of a README.

    Args:
        text: Raw README text; it is cleaned with ``preprocess_readme`` first.
        classifier: Model to query. Defaults to the notebook-level ``model``.
        class_names: Class labels in the order of the model outputs. Defaults
            to the columns of ``pd.get_dummies(df['language'])``, which is how
            the training labels were encoded.

    Returns:
        A dict mapping each class name to its predicted probability.
    """
    if classifier is None:
        classifier = model
    if class_names is None:
        # The output units follow the one-hot column order, which is not the
        # order of the `languages` query list.
        class_names = list(pd.get_dummies(df['language']).columns)

    # Preprocess the text
    processed_text = preprocess_readme(text)

    # Convert to tf.data.Dataset
    input_ds = tf.data.Dataset.from_tensor_slices([processed_text]).batch(1)

    # Get prediction
    pred = classifier.predict(input_ds)[0]

    # Return probabilities for each language
    return {lang: float(prob) for lang, prob in zip(class_names, pred)}

# Well-known repositories paired with the language we expect the model to pick
test_repos = [
    ('tensorflow/tensorflow', 'Python'),
    ('vuejs/vue', 'JavaScript'),
    ('golang/go', 'Go'),
    ('rust-lang/rust', 'Rust'),
    ('spring-projects/spring-boot', 'Java'),
]

for repo, expected_lang in test_repos:
    owner, name = repo.split('/')
    readme = collector.get_readme(owner, name)
    if not readme:
        # README could not be fetched; nothing to classify
        continue

    predictions = predict_language(readme)
    # Rank classes from most to least probable and keep the top three
    ranked = sorted(predictions.items(), key=lambda item: item[1], reverse=True)

    print(f'Repository: {repo}')
    print(f'Expected: {expected_lang}')
    print('Predictions:')
    for lang, prob in ranked[:3]:
        print(f'{lang}: {prob:.2%}')


## Saving and Deploying the Model

In [None]:
# Save the complete model
model.save('readme_classifier_model.keras')

# Save the vectorization configuration
import pickle
with open('vectorizer_config.pkl', 'wb') as f:
    pickle.dump({
        'config': vectorize_layer.get_config(),
        'weights': vectorize_layer.get_weights()
    }, f)

print('Model and vectorizer saved successfully!')

# Example of loading and using the saved model
print('Loading and testing saved model...')
loaded_model = tf.keras.models.load_model('readme_classifier_model.keras')

# Test with a simple Python code snippet. The prediction is made with
# `loaded_model` directly so that the reloaded file is what gets exercised.
sample_text = 'import tensorflow as tf'
sample_ds = tf.data.Dataset.from_tensor_slices([preprocess_readme(sample_text)]).batch(1)
sample_probs = loaded_model.predict(sample_ds)[0]

# Class names in the order of the one-hot label columns used for training
class_names = list(pd.get_dummies(df['language']).columns)
predictions = {lang: float(prob) for lang, prob in zip(class_names, sample_probs)}

print('Test prediction with loaded model:')
for lang, prob in sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:3]:
    print(f'{lang}: {prob:.2%}')
