In [1]:
import pandas as pd
from scipy.stats import randint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV
from wordcloud import WordCloud,STOPWORDS
import lxml
import joblib
import numpy as np
import spacy as sp
#import transformers

In [2]:
# Read the labelled website corpus, then work on a copy restricted to the
# URL, the pre-cleaned page text and the target category.
dataset = pd.read_csv("website_classification.csv")
df = dataset.loc[:, ['website_url', 'cleaned_website_text', 'Category']].copy()
df.head()

Unnamed: 0,website_url,cleaned_website_text,Category
0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel


In [3]:
# Distinct category labels, shown as a single-column array.
pd.DataFrame(df['Category'].unique()).to_numpy()

array([['Travel'],
       ['Social Networking and Messaging'],
       ['News'],
       ['Streaming Services'],
       ['Sports'],
       ['Photography'],
       ['Law and Government'],
       ['Health and Fitness'],
       ['Games'],
       ['E-Commerce'],
       ['Forums'],
       ['Food'],
       ['Education'],
       ['Computers and Technology'],
       ['Business/Corporate'],
       ['Adult']], dtype=object)

In [4]:
# Encode every category label as an integer id stored in 'category_id'.
df['category_id'] = pd.factorize(df['Category'])[0]

# One row per distinct (Category, category_id) pair.
category_id_df = df.loc[:, ['Category', 'category_id']].drop_duplicates()

# Lookup tables in both directions, used later to turn predictions back
# into readable labels.
category_to_id = dict(category_id_df.to_numpy())
id_to_category = dict(category_id_df.loc[:, ['category_id', 'Category']].to_numpy())

In [5]:
# TF-IDF over unigrams and bigrams: sublinear term-frequency scaling,
# terms must appear in at least 5 documents, English stop words removed.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
)

# Vectorise every cleaned page text and convert the result to a dense array.
features = tfidf.fit_transform(df['cleaned_website_text']).toarray()

# Integer-encoded targets aligned with `features`.
labels = df['category_id']



In [6]:
# Documents to classify and the string labels to predict.
X = df.cleaned_website_text
y = df.Category

# Hold out a quarter of the rows; the fixed seed keeps the split repeatable.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.25,
)

In [None]:
# Re-split using the integer-encoded labels (25 % held out, fixed seed) so the
# classifiers predict ids that `id_to_category` can map back to names.
X_train, X_test, y_train, y_test = train_test_split(X, df['category_id'],
                                                    test_size=0.25,
                                                    random_state=0)

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2),
                        stop_words='english')

# Fit the vocabulary / IDF weights on the training split only, so nothing
# from the held-out split leaks into the features.
fitted_vectorizer = tfidf.fit(X_train)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(X_train)

# Plain linear SVM. A fixed random_state makes the fit repeatable across runs.
m = LinearSVC(random_state=0).fit(tfidf_vectorizer_vectors, y_train)

# Probability-calibrated variant.
# The calibrator must be fitted on data the underlying classifier was NOT
# trained on; calibrating a prefit model on its own training rows gives
# over-confident probabilities. With cv=5, CalibratedClassifierCV trains the
# SVM on four folds and calibrates on the fifth, keeping the two disjoint.
# This also avoids cv="prefit", which recent scikit-learn releases deprecate.
m1 = CalibratedClassifierCV(estimator=LinearSVC(random_state=0),
                            cv=5).fit(tfidf_vectorizer_vectors, y_train)




In [12]:
# Persist the raw LinearSVC classifier
joblib.dump(value=m, filename="web_classifier.joblib")

# Persist the probability-calibrated classifier
joblib.dump(value=m1, filename="cal_web_classifier.joblib")

# Persist the fitted TF-IDF vectorizer needed to featurise new text
joblib.dump(value=fitted_vectorizer, filename="tf_vectorizer.joblib")

['tf_vectorizer.joblib']

In [14]:
from scraptool import ScrapTool

website = 'https://www.mit.edu/'
scraper = ScrapTool()

# Bind `text` up front so later cells that reference it do not fail with a
# NameError when the scrape itself raises.
text = ''

try:
    web = dict(scraper.visit_url(website))
    # `or ''` also covers a 'website_text' entry that is present but None.
    text = web.get('website_text') or ''

    if text.strip():
        t = fitted_vectorizer.transform([text])
        print(id_to_category[m1.predict(t)[0]])
    else:
        # An empty document becomes an all-zero vector; the model would still
        # return some category, which would be meaningless.
        print(f"No text could be extracted from {website}; skipping classification")

except Exception as error:
    # Best-effort demo cell: report the failure (with its type, since some
    # exceptions have uninformative messages) and carry on.
    print(f"{type(error).__name__}: {error}")

Education


In [15]:
# Display the raw scraped page text that was passed to the vectorizer above.
text



"MIT - Massachusetts Institute of TechnologyMassachusetts Institute of Technology Spotlight: Mar 28, 2025 Mar 28, 2025 Share:Skip to content ↓ Education Research Innovation Admissions + Aid Campus Life News Alumni About MIT More ↓ Admissions + Aid Campus Life News Alumni About MIT Menu ↓ Search Menu Explore websites, people, and locations Look up people by “last name sounds like” What are you looking for? See More Results Suggestions or feedback? MIT's response to government activity : Read the latest about government activity affecting MIT. Updates from campus : Read the latest from MIT and its leaders regarding events on campus. Spotlight: Mar 28, 2025 Students in the new class 16.811 (Advanced Manufacturing for Aerospace Engineers) design, build, and test an electric rocket engine turbopump. Zachary Cordero says they “don’t just learn how to solve a problem set — they learn how to be an engineer.” Full story Twitter Facebook MORE FROM THE MIT COMMUNITY IN THE MEDIA Cathy Fang spoke 

In [11]:
from scraptool import CleanText

# Run the scraped text through the project's cleaning helpers:
# text_cleaning -> basic_cleaning -> tokenize_text -> tokens_to_text.
cleaner = CleanText()
clean_text = cleaner.basic_cleaning(cleaner.text_cleaning(text))
token_text = cleaner.tokenize_text(clean_text)
text_to_generate = cleaner.tokens_to_text(token_text)



NameError: name 'AutoTokenizer' is not defined

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained('t5-base')
tokenizer = T5Tokenizer.from_pretrained('t5-base')


def generate_text(prompt, max_length=150, max_input_length=512):
    """Run the module-level T5 model on ``prompt`` and return the decoded output.

    Args:
        prompt: Full input string, including any T5 task prefix
            (for example ``"summarize: ..."``).
        max_length: Upper bound on the number of generated tokens.
        max_input_length: Inputs longer than this many tokens are truncated
            so they fit the model's input window.

    Returns:
        The generated text with special tokens stripped.
    """
    inputs_ids = tokenizer.encode(prompt, return_tensors='pt',
                                  truncation=True, max_length=max_input_length)
    output = model.generate(inputs_ids, max_length=max_length)
    return tokenizer.decode(output[0], skip_special_tokens=True)


# T5 selects its task from a text prefix, and the text to summarise has to be
# part of the prompt itself; here that is the cleaned page text built earlier.
prompt = "summarize: " + text_to_generate

# Store the result under its own name so `generate_text` stays callable when
# this cell (or a later one) calls it again.
summary = generate_text(prompt)
print(summary)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# "gpt2" is a decoder-only checkpoint, so it has to be loaded with the GPT-2
# tokenizer and the PyTorch GPT-2 LM head; the T5 classes cannot load it.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Number of tokens to generate on top of the prompt.
MAX_NEW_TOKENS = 100

# Your list of tokens (as strings)
token_list = token_text

# 1. Turn the tokens back into text and encode it with GPT-2's own tokenizer.
#    Looking the words up directly with convert_tokens_to_ids would map almost
#    all of them to the unknown-token id, because GPT-2 uses byte-level BPE.
#    NOTE(review): assumes token_text is a list of plain word strings — confirm
#    against CleanText.tokenize_text; otherwise encode `text_to_generate`.
prompt_text = " ".join(token_list)

# 2. Encode to a PyTorch tensor (batch dimension included), truncated so that
#    prompt + generated tokens fit in the model's context window.
encoded = tokenizer(
    prompt_text,
    return_tensors="pt",
    truncation=True,
    max_length=model.config.n_positions - MAX_NEW_TOKENS,
)
input_tensor = encoded["input_ids"]

# 3. Generate text using the model.
#    max_new_tokens bounds only the continuation; max_length would also count
#    the prompt and leave no room when the prompt is long.
with torch.no_grad():
    output_ids = model.generate(
        input_tensor,
        attention_mask=encoded["attention_mask"],
        max_new_tokens=MAX_NEW_TOKENS,
        num_return_sequences=1,  # Generate a single sequence
        do_sample=False,  # Greedy decoding; temperature/top_k/top_p need do_sample=True
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 defines no pad token
    )

# 4. Decode the generated token IDs back to a text string
#    (for a decoder-only model the output starts with the prompt).
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

#print("List of Tokens:", token_list)
print("Generated Text:", generated_text)

In [None]:
# Analyst notes, kept verbatim (in Spanish) in the string below.
# English translation:
#   Similarities between: news - business and corporate
#   Hard to classify: e-commerce
'''
Similitudes entre 

news - business and corporate

Dificultades para clasificar

e-commerce

'''