# Microsoft - Teams File Sharing Security and Collaboration Insights

```SQL
-- Organization dimension: one row per organization with its business segment.
CREATE TABLE dim_organization (
    organization_id INTEGER NOT NULL,
    organization_name VARCHAR NOT NULL,
    segment VARCHAR NOT NULL,
    CONSTRAINT dim_organization_pk PRIMARY KEY (organization_id)
);

-- File-sharing fact: one row per shared file, linked to the owning organization.
CREATE TABLE fct_file_sharing (
    file_id INTEGER NOT NULL,
    file_name VARCHAR NOT NULL,
    organization_id INTEGER NOT NULL,
    shared_date DATE NOT NULL,
    -- Deliberately nullable: the analysis below treats a NULL co-editing user
    -- as a possible security risk, so NULL is a meaningful value here.
    co_editing_user_id INTEGER,
    CONSTRAINT fct_file_sharing_pk PRIMARY KEY (file_id),
    -- Every shared file must belong to a known organization.
    CONSTRAINT fct_file_sharing_organization_id_fk
        FOREIGN KEY (organization_id)
        REFERENCES dim_organization (organization_id)
);

-- Seed the organization dimension: ten organizations spread over six segments
-- (Finance, Technology, Healthcare, Retail, Government, Manufacturing).
INSERT INTO dim_organization (organization_id, organization_name, segment)
VALUES
    (1, 'AlphaCorp', 'Finance'),
    (2, 'BetaTech', 'Technology'),
    (3, 'GammaLLC', 'Healthcare'),
    (4, 'DeltaInc', 'Retail'),
    (5, 'EpsilonLtd', 'Government'),
    (6, 'ZetaPartners', 'Finance'),
    (7, 'EtaSolutions', 'Technology'),
    (8, 'ThetaSystems', 'Healthcare'),
    (9, 'IotaServices', 'Retail'),
    (10, 'KappaGlobal', 'Manufacturing');


-- Seed the file-sharing fact table with 18 files shared between January and
-- March 2024. Columns are listed in the same order as the table definition.
-- A NULL co_editing_user_id means no co-editing user was recorded for the file.
INSERT INTO fct_file_sharing (
    file_id,
    file_name,
    organization_id,
    shared_date,
    co_editing_user_id
)
VALUES
    (1, 'AlphaCorp-report', 1, '2024-01-05', 1001),
    (2, 'Summary', 2, '2024-01-12', NULL),
    (3, 'GammaLLC_DataAnalysis', 3, '2024-01-20', NULL),
    (4, 'DeltaInc_Notes', 4, '2024-01-25', 2002),
    (5, 'MeetingMinutes', 5, '2024-01-30', NULL),
    (6, 'AlphaCorp-Summary', 1, '2024-02-03', 3001),
    (7, 'BetaTech-Overview', 2, '2024-02-10', NULL),
    (8, 'DataSet', 3, '2024-02-15', 3003),
    (9, 'ZetaPartners-Quarterly', 6, '2024-02-20', NULL),
    (10, 'KappaGlobal-Plan', 10, '2024-02-25', 3005),
    (11, 'Proposal', 7, '2024-03-05', NULL),
    (12, 'ThetaSystems_Design', 8, '2024-03-10', 4001),
    (13, 'IotaServices-Update', 9, '2024-03-15', NULL),
    (14, 'DeltaIncStrategies', 4, '2024-03-20', NULL),
    (15, 'EpsilonLtd-Finance', 5, '2024-03-25', 4003),
    (16, 'EtaSolutions-Guide', 7, '2024-02-28', NULL),
    (17, 'EtaSolutions-Edit', 7, '2024-01-21', NULL),
    (18, 'AlphaCorp-Review', 1, '2024-03-02', NULL);

-- Inspect the seeded organization rows. Columns are named explicitly and rows
-- are ordered by key so the output is stable across runs.
SELECT
    organization_id,
    organization_name,
    segment
FROM dim_organization
ORDER BY organization_id;

-- Inspect the seeded file-sharing rows, ordered by key for the same reason.
SELECT
    file_id,
    file_name,
    organization_id,
    shared_date,
    co_editing_user_id
FROM fct_file_sharing
ORDER BY file_id;
```

In [1]:
import pandas as pd
import numpy as np

In [5]:
# Load the organization dimension and the file-sharing fact table from CSV.
# No parse_dates argument is passed, so shared_date is presumably kept as
# ISO 'YYYY-MM-DD' text rather than a datetime column.
df_org = pd.read_csv('Data/017/dim_organization.csv')
df_file = pd.read_csv('Data/017/fct_file_sharing.csv')

# Preview the first rows of the organization dimension.
df_org.head()

Unnamed: 0,organization_id,organization_name,segment
0,1,AlphaCorp,Finance
1,2,BetaTech,Technology
2,3,GammaLLC,Healthcare
3,4,DeltaInc,Retail
4,5,EpsilonLtd,Government


In [6]:
df_file.head()

Unnamed: 0,file_id,file_name,organization_id,shared_date,co_editing_user_id
0,1,AlphaCorp-report,1,2024-01-05,1001.0
1,2,Summary,2,2024-01-12,
2,3,GammaLLC_DataAnalysis,3,2024-01-20,
3,4,DeltaInc_Notes,4,2024-01-25,2002.0
4,5,MeetingMinutes,5,2024-01-30,


# Pregunta 1

### ¿Cuál es la longitud promedio de los nombres de los archivos compartidos para cada segmento organizacional en enero de 2024?

In [8]:
# Step 1: join every shared file to its organization row on organization_id.
df_merged = pd.merge(df_file, df_org, on='organization_id')

# Step 2: keep January 2024 only (both bounds inclusive).
mask_enero = (
    (df_merged['shared_date'] >= '2024-01-01')
    & (df_merged['shared_date'] <= '2024-01-31')
)
df_enero = df_merged.loc[mask_enero].copy()

# Step 3: measure each file name, then average the lengths per segment.
df_enero['name_length'] = df_enero['file_name'].str.len()
resultado = (
    df_enero
    .groupby('segment')['name_length']
    .agg('mean')
    .reset_index()
)

resultado

Unnamed: 0,segment,name_length
0,Finance,16.0
1,Government,14.0
2,Healthcare,21.0
3,Retail,14.0
4,Technology,12.0


```SQL
-- Average file-name length per organization segment for files shared in
-- January 2024.
SELECT
    o.segment,
    AVG(LENGTH(f.file_name)) AS avg_name_length
FROM fct_file_sharing AS f
INNER JOIN dim_organization AS o
    ON f.organization_id = o.organization_id
-- Half-open range: covers the whole month without hard-coding its last day.
WHERE f.shared_date >= '2024-01-01'
  AND f.shared_date < '2024-02-01'
GROUP BY o.segment
-- Explicit ordering keeps the output stable across runs.
ORDER BY o.segment;
```

# Pregunta 2

### ¿Cuántos archivos se compartieron cuyos nombres comienzan con el mismo prefijo que el nombre de la organización, concatenado con un guion, en febrero de 2024?

In [9]:
# 1. Merge: attach the organization name to every shared file.
df_final = df_file.merge(df_org, on='organization_id')

# 2. February 2024 filter (both bounds inclusive; 2024 is a leap year).
df_feb = df_final[df_final['shared_date'].between('2024-02-01', '2024-02-29')].copy()

# 3. Prefix check: the file name must start with "<organization_name>-".
def cumple_prefijo(row):
    """Return True when the row's file_name starts with its organization_name plus a hyphen."""
    prefijo_esperado = row['organization_name'] + "-"
    return row['file_name'].startswith(prefijo_esperado)

# 4. Count the True results.
# The flags are built as an explicit boolean Series instead of using
# DataFrame.apply(axis=1): on an empty February slice, apply returns an empty
# DataFrame and .sum() would then yield a Series instead of a scalar count.
cumple = pd.Series(
    [cumple_prefijo(fila) for fila in df_feb.to_dict('records')],
    index=df_feb.index,
    dtype=bool,
)
total = cumple.sum()

total

np.int64(5)

```SQL
-- Count files shared in February 2024 whose name starts with the owning
-- organization's name followed by a hyphen.
SELECT
    COUNT(*) AS standard_compliant_files
FROM fct_file_sharing AS f
INNER JOIN dim_organization AS o
    ON f.organization_id = o.organization_id
-- Half-open range: covers all of February without hard-coding the leap day.
WHERE f.shared_date >= '2024-02-01'
  AND f.shared_date < '2024-03-01'
  -- Literal prefix comparison instead of LIKE: with LIKE, any '_' or '%' inside
  -- organization_name would act as a wildcard and match unrelated file names.
  AND SUBSTR(f.file_name, 1, LENGTH(o.organization_name) + 1)
      = CONCAT(o.organization_name, '-');

```

# Pregunta 3

### Identifique los 3 segmentos organizacionales con el mayor número de archivos compartidos donde el usuario de coedición sea NULL (nulo), lo que indica un posible riesgo de seguridad, durante el primer trimestre de 2024.

In [11]:
# 1. Merge files with their organization and keep Q1 2024 only (bounds inclusive).
df_merged = df_file.merge(df_org, on='organization_id')
df_q1 = df_merged[df_merged['shared_date'].between('2024-01-01', '2024-03-31')]

# 2. Risk filter: files with no co-editing user.
# Missing values from the CSV are detected in pandas with .isna().
df_riesgo = df_q1[df_q1['co_editing_user_id'].isna()]

# 3. Count risky files per segment and keep the top 3.
# The segment name is a secondary sort key so that segments tied on count are
# ordered deterministically before the cut at three rows.
top_3_riesgo = (df_riesgo.groupby('segment')
                .size()
                .reset_index(name='count')
                .sort_values(by=['count', 'segment'], ascending=[False, True])
                .head(3))

top_3_riesgo

Unnamed: 0,segment,count
4,Technology,5
0,Finance,2
3,Retail,2


```SQL
-- Top 3 organization segments by number of files shared in Q1 2024 with no
-- co-editing user (NULL), which the analysis treats as a possible security risk.
SELECT
    o.segment,
    COUNT(f.file_id) AS total_risky_files
FROM fct_file_sharing AS f
INNER JOIN dim_organization AS o
    ON f.organization_id = o.organization_id
WHERE f.co_editing_user_id IS NULL
  -- Half-open range covering January through March 2024.
  AND f.shared_date >= '2024-01-01'
  AND f.shared_date < '2024-04-01'
GROUP BY o.segment
-- The segment name breaks ties on the count; without it, which tied segments
-- survive LIMIT 3 is unspecified.
ORDER BY
    total_risky_files DESC,
    o.segment ASC
LIMIT 3;
```