# Análise de Recomendações de Companhias Aéreas

Este notebook realiza:
- Exploração dos dados
- Pré-processamento (tabular e texto)
- Construção de três modelos:
    1. Apenas dados tabulares
    2. Apenas dados textuais
    3. Combinação tabular + texto
- Avaliação dos modelos com métricas
- Visualizações e análise das palavras mais importantes


In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack


In [None]:
# Load the airline reviews dataset from the CSV file into a DataFrame.
file_path = 'airlines_reviews.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows (shown as the cell output in a notebook).
df.head()


In [None]:
# Preprocessing: encode the target, select the feature columns and split
# the data into train and test sets.
print(df.columns)

# Encode the target as 1 (recommended) / 0 (not recommended).
# Labels are normalised (string, stripped, lower-cased) before mapping so
# variants such as 'Yes' or ' no ' are recognised. '1'/'0' are accepted too,
# which makes this cell safe to re-run on the already-encoded column.
label_map = {'yes': 1, 'no': 0, '1': 1, '0': 0}
recommended = df['Recommended'].astype(str).str.strip().str.lower().map(label_map)
if recommended.isna().any():
    # Fail here with a clear message rather than passing NaN targets on to
    # the estimators fitted later in the notebook.
    bad_labels = df.loc[recommended.isna(), 'Recommended'].unique().tolist()
    raise ValueError(f"Unexpected values in 'Recommended': {bad_labels}")
df['Recommended'] = recommended

# Column selection: one free-text column plus the tabular features.
text_col = 'Reviews'
tabular_cols = [
    'Airline',
    'Type of Traveller',
    'Month Flown',
    'Route',
    'Class',
    'Seat Comfort',
    'Staff Service',
    'Food & Beverages',
    'Inflight Entertainment',
    'Value For Money',
    'Overall Rating',
]

# Missing reviews become empty strings. A bare astype(str) would turn them
# into the literal text 'nan', which would then be vectorised like a word.
X_text = df[text_col].fillna('').astype(str)
X_tabular = df[tabular_cols]
y = df['Recommended']

# Train/test split (80/20). The same split is applied to the tabular data,
# the text and the target so rows stay aligned. stratify=y keeps the class
# proportions the same in both partitions.
X_tab_train, X_tab_test, X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_tabular,
    X_text,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# Tabular-only model: preprocessing and a random forest in a single pipeline.
numeric_features = [
    'Seat Comfort',
    'Staff Service',
    'Food & Beverages',
    'Inflight Entertainment',
    'Value For Money',
    'Overall Rating',
]
categorical_features = [
    'Airline',
    'Type of Traveller',
    'Month Flown',
    'Route',
    'Class',
]

# Numeric columns: fill missing values with the median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time (handle_unknown='ignore').
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Full model: preprocessing followed by the classifier.
model_tabular = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)
model_tabular.fit(X_tab_train, y_train)
y_pred_tab = model_tabular.predict(X_tab_test)

# Evaluate on the held-out test split.
print('Accuracy:', accuracy_score(y_test, y_pred_tab))
print('F1:', f1_score(y_test, y_pred_tab))


In [None]:
# Text-only model: TF-IDF features fed into a logistic regression.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
# The vocabulary and IDF weights are learned from the training reviews only;
# the test reviews are merely transformed with them.
X_text_train_vec = vectorizer.fit_transform(X_text_train)
X_text_test_vec = vectorizer.transform(X_text_test)

model_text = LogisticRegression(max_iter=1000)
model_text.fit(X_text_train_vec, y_train)
y_pred_text = model_text.predict(X_text_test_vec)

# Evaluate on the held-out test split.
print('Accuracy:', accuracy_score(y_test, y_pred_text))
print('F1:', f1_score(y_test, y_pred_text))

# Most influential words, ranked by their logistic-regression coefficient.
# Large positive coefficients push towards class 1 and large negative ones
# towards class 0.
feature_names = np.array(vectorizer.get_feature_names_out())
coef = model_text.coef_[0]
coef_order = np.argsort(coef)  # indices sorted by ascending coefficient
# Both lists are printed strongest-first: the positive list reads the sorted
# indices from the end, the negative list from the start.
print('Top recomenda:', feature_names[coef_order[::-1][:20]])
print('Top não recomenda:', feature_names[coef_order[:20]])


In [None]:
# Combined model: preprocessed tabular features stacked side by side with the
# TF-IDF text features, then fed into a logistic regression.
model_comb = LogisticRegression(max_iter=1000)

# Tabular part: the preprocessor is fitted on the training split and then
# applied unchanged to the test split.
X_tab_train_proc = preprocessor.fit_transform(X_tab_train)
X_tab_test_proc = preprocessor.transform(X_tab_test)

# Column-wise concatenation: tabular columns first, text columns after.
X_comb_train = hstack((X_tab_train_proc, X_text_train_vec))
X_comb_test = hstack((X_tab_test_proc, X_text_test_vec))

model_comb.fit(X_comb_train, y_train)
y_pred_comb = model_comb.predict(X_comb_test)

# Evaluate on the held-out test split.
print('Accuracy:', accuracy_score(y_test, y_pred_comb))
print('F1:', f1_score(y_test, y_pred_comb))
